diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml
new file mode 100644
index 00000000..75dac32c
--- /dev/null
+++ b/.github/workflows/accuracy-tests.yml
@@ -0,0 +1,57 @@
+# Runs the accuracy test suite and publishes the generated HTML summary.
+# Triggers: manual dispatch, or a label being added to a pull request.
+name: Accuracy Tests
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [labeled]
+
+jobs:
+  run-accuracy-tests:
+    name: Run Accuracy Tests
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets the sticky-comment step write the summary onto the PR.
+      pull-requests: write
+    # `labeled` fires for any label, so PR runs are limited to `accuracy-tests`.
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests')
+    steps:
+      - uses: GitHubSecurityLab/actions-permissions/monitor@v1
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version-file: package.json
+          cache: "npm"
+      - name: Install dependencies
+        run: npm ci
+      - name: Run accuracy tests
+        # Same entry point as local runs (the `test:accuracy` npm script).
+        run: npm run test:accuracy
+        # Scoped to this step so that package install scripts and the
+        # third-party actions in this job never receive the credentials.
+        env:
+          MDB_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_OPEN_AI_API_KEY }}
+          MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }}
+          MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }}
+          MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }}
+          MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }}
+          MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }}
+          MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }}
+          # Empty on manual dispatch, where there is no pull request payload.
+          MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }}
+      - name: Upload accuracy test summary
+        # Keep the summary available even when the test step fails.
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: accuracy-test-summary
+          path: .accuracy/tests-summary.html
+      - name: Comment summary on PR
+        if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          path: .accuracy/tests-summary.html
diff --git a/.gitignore b/.gitignore
index 4e3f7a54..49550e27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ state.json
 
 tests/tmp
 coverage
+# Assets generated by accuracy test runs
+.accuracy
diff --git a/package-lock.json b/package-lock.json
index 29132ba3..a3bf47c4 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -30,7 +30,11 @@
         "mongodb-mcp-server": "dist/index.js"
       },
       "devDependencies": {
+        "@ai-sdk/anthropic": "^1.2.12",
+        "@ai-sdk/azure": "^1.3.23",
+        "@ai-sdk/openai": "^1.3.22",
         "@eslint/js": "^9.30.1",
+        "@himanshusinghs/google": "^1.2.11",
         "@jest/globals": "^30.0.4",
         "@modelcontextprotocol/inspector": "^0.16.0",
         "@redocly/cli": "^1.34.4",
@@ -38,6 +42,7 @@
         "@types/node": "^24.0.12",
         "@types/simple-oauth2": "^5.0.7",
         "@types/yargs-parser": "^21.0.3",
+        "ai": "^4.3.16",
         "eslint": "^9.30.1",
         "eslint-config-prettier": "^10.1.5",
         "eslint-plugin-jest": "^29.0.1",
@@ -46,20 +51,153 @@
         "jest": "^30.0.4",
         "jest-environment-node": "^30.0.4",
         "jest-extended": "^6.0.0",
+        "microdiff": "^1.5.0",
         "mongodb-runner": "^5.9.2",
+        "ollama-ai-provider": "^1.2.0",
         "openapi-types": "^12.1.3",
         "openapi-typescript": "^7.8.0",
         "prettier": "^3.6.2",
+        "simple-git": "^3.28.0",
         "ts-jest": "^29.4.0",
         "tsx": "^4.20.3",
         "typescript": "^5.8.3",
         "typescript-eslint": "^8.36.0",
+        "uuid": "^11.1.0",
         "yaml": "^2.8.0"
       },
       "engines": {
         "node": ">=20.10.0"
       }
     },
+    "@himanshusinghs/ai-sdk-google": {
+      "extraneous": true
+    },
+    "node_modules/@ai-sdk/anthropic": {
+      "version": "1.2.12",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-1.2.12.tgz",
+      "integrity": "sha512-YSzjlko7JvuiyQFmI9RN1tNZdEiZxc+6xld/0tq/VkJaHpEzGAb1yiNxxvmYVcjvfu/PcvCxAAYXmTYQQ63IHQ==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.3",
+        "@ai-sdk/provider-utils": "2.2.8"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/azure": {
+      "version": "1.3.23",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz",
+      "integrity": "sha512-vpsaPtU24RBVk/IMM5UylR/N4RtAuL2NZLWc7LJ3tvMTHu6pI46a7w+1qIwR3F6yO9ehWR8qvfLaBefJNFxaVw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/openai": "1.3.22",
+        "@ai-sdk/provider": "1.1.3",
+        "@ai-sdk/provider-utils": "2.2.8"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/openai": {
+      "version": "1.3.22",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.22.tgz",
+      "integrity": "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.3",
+        "@ai-sdk/provider-utils": "2.2.8"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/provider": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz",
+      "integrity": "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "json-schema": "^0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@ai-sdk/provider-utils": {
+      "version": "2.2.8",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.8.tgz",
+      "integrity": "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.3",
+        "nanoid": "^3.3.8",
+        "secure-json-parse": "^2.7.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.23.8"
+      }
+    },
+    "node_modules/@ai-sdk/react": {
+      "version": "1.2.12",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.2.12.tgz",
+      "integrity": "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider-utils": "2.2.8",
+        "@ai-sdk/ui-utils": "1.2.11",
+        "swr": "^2.2.5",
+        "throttleit": "2.1.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "react": "^18 || ^19 || ^19.0.0-rc",
+        "zod": "^3.23.8"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@ai-sdk/ui-utils": {
+      "version": "1.2.11",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.11.tgz",
+      "integrity": "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.3",
+        "@ai-sdk/provider-utils": "2.2.8",
+        "zod-to-json-schema": "^3.24.1"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.23.8"
+      }
+    },
     "node_modules/@ampproject/remapping": {
       "version": "2.3.0",
       "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz",
@@ -2090,6 +2228,54 @@
         "@hapi/hoek": "^11.0.2"
       }
     },
+    "node_modules/@himanshusinghs/google": {
+      "version": "1.2.11",
+      "resolved": "https://registry.npmjs.org/@himanshusinghs/google/-/google-1.2.11.tgz",
+      "integrity": "sha512-SKTFxwN9PpUHVrppFod8sF1jqys5azzsgcBVrSbc7VaazmVEnBxHQlv5/yfeZFjD3ly5Mw+AJdFfC0bxwdWBNg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.2",
+        "@ai-sdk/provider-utils": "2.2.6"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.2.tgz",
+      "integrity": "sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "json-schema": "^0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider-utils": {
+      "version": "2.2.6",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.6.tgz",
+      "integrity": "sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.2",
+        "nanoid": "^3.3.8",
+        "secure-json-parse": "^2.7.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.23.8"
+      }
+    },
     "node_modules/@humanfs/core": {
       "version": "0.19.1",
       "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
@@ -2929,6 +3115,23 @@
         "jsep": "^0.4.0||^1.0.0"
       }
     },
+    "node_modules/@kwsites/file-exists": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/@kwsites/file-exists/-/file-exists-1.1.1.tgz",
+      "integrity": "sha512-m9/5YGR18lIwxSFDwfE3oA7bWuq9kdau6ugN4H2rJeyhFQZcG9AgSHkQtSD15a8WvTgfz9aikZMrKPHvbpqFiw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "debug": "^4.1.1"
+      }
+    },
+    "node_modules/@kwsites/promise-deferred": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/@kwsites/promise-deferred/-/promise-deferred-1.1.1.tgz",
+      "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@modelcontextprotocol/inspector": {
       "version": "0.16.0",
       "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.0.tgz",
@@ -5424,6 +5627,19 @@
         "node": ">=18.0.0"
       }
     },
+    "node_modules/@smithy/middleware-retry/node_modules/uuid": {
+      "version": "9.0.1",
+      "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
+      "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
+      "funding": [
+        "https://github.com/sponsors/broofa",
+        "https://github.com/sponsors/ctavan"
+      ],
+      "license": "MIT",
+      "bin": {
+        "uuid": "dist/bin/uuid"
+      }
+    },
     "node_modules/@smithy/middleware-serde": {
       "version": "4.0.3",
       "resolved": "https://registry.npmjs.org/@smithy/middleware-serde/-/middleware-serde-4.0.3.tgz",
@@ -5906,6 +6122,13 @@
         "@babel/types": "^7.20.7"
       }
     },
+    "node_modules/@types/diff-match-patch": {
+      "version": "1.0.36",
+      "resolved": "https://registry.npmjs.org/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz",
+      "integrity": "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@types/estree": {
       "version": "1.0.7",
       "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz",
@@ -6660,6 +6883,33 @@
         "node": ">= 14"
       }
     },
+    "node_modules/ai": {
+      "version": "4.3.16",
+      "resolved": "https://registry.npmjs.org/ai/-/ai-4.3.16.tgz",
+      "integrity": "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "1.1.3",
+        "@ai-sdk/provider-utils": "2.2.8",
+        "@ai-sdk/react": "1.2.12",
+        "@ai-sdk/ui-utils": "1.2.11",
+        "@opentelemetry/api": "1.9.0",
+        "jsondiffpatch": "0.6.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "react": "^18 || ^19 || ^19.0.0-rc",
+        "zod": "^3.23.8"
+      },
+      "peerDependenciesMeta": {
+        "react": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/ajv": {
       "version": "6.12.6",
       "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
@@ -8376,6 +8626,16 @@
         "node": ">= 0.8"
       }
     },
+    "node_modules/dequal": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
+      "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
     "node_modules/destroy": {
       "version": "1.2.0",
       "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz",
@@ -8423,6 +8683,13 @@
         "node": ">=0.3.1"
       }
     },
+    "node_modules/diff-match-patch": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz",
+      "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==",
+      "dev": true,
+      "license": "Apache-2.0"
+    },
     "node_modules/diff-sequences": {
       "version": "29.6.3",
       "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz",
@@ -11803,6 +12070,13 @@
         "foreach": "^2.0.4"
       }
     },
+    "node_modules/json-schema": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
+      "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
+      "dev": true,
+      "license": "(AFL-2.1 OR BSD-3-Clause)"
+    },
     "node_modules/json-schema-traverse": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
@@ -11830,6 +12104,37 @@
         "node": ">=6"
       }
     },
+    "node_modules/jsondiffpatch": {
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/jsondiffpatch/-/jsondiffpatch-0.6.0.tgz",
+      "integrity": "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/diff-match-patch": "^1.0.36",
+        "chalk": "^5.3.0",
+        "diff-match-patch": "^1.0.5"
+      },
+      "bin": {
+        "jsondiffpatch": "bin/jsondiffpatch.js"
+      },
+      "engines": {
+        "node": "^18.0.0 || >=20.0.0"
+      }
+    },
+    "node_modules/jsondiffpatch/node_modules/chalk": {
+      "version": "5.4.1",
+      "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz",
+      "integrity": "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "^12.17.0 || ^14.13 || >=16.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/chalk?sponsor=1"
+      }
+    },
     "node_modules/jsonpath-plus": {
       "version": "10.3.0",
       "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
@@ -12132,6 +12437,13 @@
         "node": ">= 0.6"
       }
     },
+    "node_modules/microdiff": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz",
+      "integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/micromatch": {
       "version": "4.0.8",
       "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz",
@@ -13011,6 +13323,29 @@
         "node": "^10.13.0 || >=12.0.0"
       }
     },
+    "node_modules/ollama-ai-provider": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/ollama-ai-provider/-/ollama-ai-provider-1.2.0.tgz",
+      "integrity": "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "^1.0.0",
+        "@ai-sdk/provider-utils": "^2.0.0",
+        "partial-json": "0.1.7"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/on-finished": {
       "version": "2.4.1",
       "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz",
@@ -13388,6 +13723,13 @@
         "node": ">= 0.8"
       }
     },
+    "node_modules/partial-json": {
+      "version": "0.1.7",
+      "resolved": "https://registry.npmjs.org/partial-json/-/partial-json-0.1.7.tgz",
+      "integrity": "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/path-browserify": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz",
@@ -14402,6 +14744,13 @@
         "loose-envify": "^1.1.0"
       }
     },
+    "node_modules/secure-json-parse": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
+      "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
     "node_modules/seek-bzip": {
       "version": "1.0.6",
       "resolved": "https://registry.npmjs.org/seek-bzip/-/seek-bzip-1.0.6.tgz",
@@ -14830,6 +15179,22 @@
         "simple-concat": "^1.0.0"
       }
     },
+    "node_modules/simple-git": {
+      "version": "3.28.0",
+      "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-3.28.0.tgz",
+      "integrity": "sha512-Rs/vQRwsn1ILH1oBUy8NucJlXmnnLeLCfcvbSehkPzbv3wwoFWIdtfd6Ndo6ZPhlPsCZ60CPI4rxurnwAa+a2w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@kwsites/file-exists": "^1.1.1",
+        "@kwsites/promise-deferred": "^1.1.1",
+        "debug": "^4.4.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/steveukx/git-js?sponsor=1"
+      }
+    },
     "node_modules/simple-oauth2": {
       "version": "5.1.0",
       "resolved": "https://registry.npmjs.org/simple-oauth2/-/simple-oauth2-5.1.0.tgz",
@@ -15351,6 +15716,20 @@
         "node": ">= 6"
       }
     },
+    "node_modules/swr": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/swr/-/swr-2.3.3.tgz",
+      "integrity": "sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "dequal": "^2.0.3",
+        "use-sync-external-store": "^1.4.0"
+      },
+      "peerDependencies": {
+        "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
+      }
+    },
     "node_modules/synckit": {
       "version": "0.11.8",
       "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz",
@@ -15571,6 +15950,19 @@
         "node": "*"
       }
     },
+    "node_modules/throttleit": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/throttleit/-/throttleit-2.1.0.tgz",
+      "integrity": "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/through": {
       "version": "2.3.8",
       "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
@@ -16163,16 +16555,17 @@
       }
     },
     "node_modules/uuid": {
-      "version": "9.0.1",
-      "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
-      "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
+      "version": "11.1.0",
+      "resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz",
+      "integrity": "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==",
+      "dev": true,
       "funding": [
         "https://github.com/sponsors/broofa",
         "https://github.com/sponsors/ctavan"
       ],
       "license": "MIT",
       "bin": {
-        "uuid": "dist/bin/uuid"
+        "uuid": "dist/esm/bin/uuid"
       }
     },
     "node_modules/v8-compile-cache-lib": {
diff --git a/package.json b/package.json
index 53d6d2c6..53639aec 100644
--- a/package.json
+++ b/package.json
@@ -29,11 +29,17 @@
     "check:types": "tsc --noEmit --project tsconfig.json",
     "reformat": "prettier --write .",
     "generate": "./scripts/generate.sh",
-    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage"
+    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/",
+    "pretest:accuracy": "npm run build:compile",
+    "test:accuracy": "sh ./scripts/run-accuracy-tests.sh"
   },
   "license": "Apache-2.0",
   "devDependencies": {
+    "@ai-sdk/anthropic": "^1.2.12",
+    "@ai-sdk/azure": "^1.3.23",
+    "@ai-sdk/openai": "^1.3.22",
     "@eslint/js": "^9.30.1",
+    "@himanshusinghs/google": "^1.2.11",
     "@jest/globals": "^30.0.4",
     "@modelcontextprotocol/inspector": "^0.16.0",
     "@redocly/cli": "^1.34.4",
@@ -41,6 +47,7 @@
     "@types/node": "^24.0.12",
     "@types/simple-oauth2": "^5.0.7",
     "@types/yargs-parser": "^21.0.3",
+    "ai": "^4.3.16",
     "eslint": "^9.30.1",
     "eslint-config-prettier": "^10.1.5",
     "eslint-plugin-jest": "^29.0.1",
@@ -49,14 +56,18 @@
     "jest": "^30.0.4",
     "jest-environment-node": "^30.0.4",
     "jest-extended": "^6.0.0",
+    "microdiff": "^1.5.0",
     "mongodb-runner": "^5.9.2",
+    "ollama-ai-provider": "^1.2.0",
     "openapi-types": "^12.1.3",
     "openapi-typescript": "^7.8.0",
     "prettier": "^3.6.2",
+    "simple-git": "^3.28.0",
     "ts-jest": "^29.4.0",
     "tsx": "^4.20.3",
     "typescript": "^5.8.3",
     "typescript-eslint": "^8.36.0",
+    "uuid": "^11.1.0",
     "yaml": "^2.8.0"
   },
   "dependencies": {
diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html
new file mode 100644
index 00000000..903457f8
--- /dev/null
+++ b/resources/test-summary-template.html
@@ -0,0 +1,407 @@
+<!doctype html>
+<html lang="en">
+    <head>
+        <meta charset="UTF-8" />
+        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+        <title>MongoDB MCP Server - Accuracy Test Summary</title>
+        <style>
+            body {
+                font-family:
+                    -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+                margin: 0;
+                padding: 20px;
+                background-color: #f5f5f5;
+                color: #333;
+            }
+            .container {
+                max-width: 1400px;
+                margin: 0 auto;
+                background: white;
+                border-radius: 8px;
+                box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
+                padding: 30px;
+            }
+            h1 {
+                color: #00684a;
+                border-bottom: 3px solid #00684a;
+                padding-bottom: 10px;
+                margin-bottom: 30px;
+            }
+            .header-info {
+                background: #f8f9fa;
+                padding: 20px;
+                border-radius: 6px;
+                margin-bottom: 20px;
+                border-left: 4px solid #00684a;
+            }
+            .header-info:nth-child(3) {
+                border-left-color: #007bff;
+            }
+            .header-info:nth-child(4) {
+                border-left-color: #28a745;
+            }
+            .header-info h2 {
+                margin-top: 0;
+                margin-bottom: 15px;
+                color: #00684a;
+                font-size: 1.2em;
+            }
+            .header-info:nth-child(3) h2 {
+                color: #007bff;
+            }
+            .header-info:nth-child(4) h2 {
+                color: #28a745;
+            }
+            .info-grid {
+                display: grid;
+                grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+                gap: 15px;
+                margin-top: 15px;
+            }
+            .info-item {
+                background: white;
+                padding: 15px;
+                border-radius: 4px;
+                border: 1px solid #dee2e6;
+            }
+            .info-label {
+                font-weight: bold;
+                color: #00684a;
+                margin-bottom: 5px;
+            }
+            .info-value {
+                color: #666;
+                word-break: break-all;
+            }
+            .summary {
+                background: #f8f9fa;
+                padding: 20px;
+                border-radius: 6px;
+                margin-bottom: 30px;
+                border-left: 4px solid #007bff;
+            }
+            .summary h2 {
+                margin-top: 0;
+                color: #007bff;
+            }
+            .stat-grid {
+                display: grid;
+                grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+                gap: 15px;
+                margin-top: 15px;
+            }
+            .stat-item {
+                background: white;
+                padding: 15px;
+                border-radius: 4px;
+                border: 1px solid #dee2e6;
+            }
+            .stat-value {
+                font-size: 1.5em;
+                font-weight: bold;
+                color: #007bff;
+            }
+            .stat-label {
+                font-size: 0.9em;
+                color: #666;
+                margin-top: 5px;
+            }
+            table {
+                width: 100%;
+                border-collapse: collapse;
+                margin-top: 20px;
+                font-size: 14px;
+            }
+            th,
+            td {
+                padding: 12px 8px;
+                text-align: left;
+                border-bottom: 1px solid #dee2e6;
+                vertical-align: top;
+            }
+            th {
+                background-color: #00684a;
+                color: white;
+                font-weight: 600;
+                position: sticky;
+                top: 0;
+                z-index: 10;
+            }
+            .test-row {
+                cursor: pointer;
+                transition: background-color 0.2s;
+            }
+            .test-row:hover {
+                background-color: #f8f9fa;
+            }
+            .expanded-row {
+                background-color: #f8f9fa;
+            }
+            .details-row {
+                display: none;
+                background-color: #ffffff;
+                border-left: 4px solid #00684a;
+            }
+            .details-row.visible {
+                display: table-row;
+            }
+            .details-content {
+                padding: 20px;
+                background: #f8f9fa;
+                border-radius: 6px;
+                margin: 10px 0;
+            }
+            .conversation-section {
+                margin-bottom: 20px;
+            }
+            .conversation-section h4 {
+                color: #00684a;
+                margin-bottom: 10px;
+            }
+            .conversation-content {
+                background: white;
+                padding: 15px;
+                border-radius: 4px;
+                border: 1px solid #dee2e6;
+                white-space: pre-wrap;
+                font-family: "Monaco", "Menlo", monospace;
+                font-size: 12px;
+                max-height: 400px;
+                overflow-y: auto;
+            }
+            .accuracy-perfect {
+                background-color: #d4edda;
+                color: #155724;
+                padding: 2px 6px;
+                border-radius: 3px;
+                font-weight: bold;
+            }
+            .accuracy-good {
+                background-color: #fff3cd;
+                color: #856404;
+                padding: 2px 6px;
+                border-radius: 3px;
+                font-weight: bold;
+            }
+            .accuracy-poor {
+                background-color: #f8d7da;
+                color: #721c24;
+                padding: 2px 6px;
+                border-radius: 3px;
+                font-weight: bold;
+            }
+            /* Chip around a single tool name; the help cursor hints at the
+               hover tooltip that carries the call parameters. */
+            .tool-call {
+                background: #e9ecef;
+                padding: 2px 6px;
+                border-radius: 3px;
+                margin: 0 2px 2px 0;
+                cursor: help;
+                display: inline-block;
+                word-break: break-word;
+            }
+            /* Chip around the total token count; hover tooltip shows the
+               prompt/completion/total breakdown. */
+            .tokens-usage {
+                background: #e3f2fd;
+                padding: 2px 6px;
+                border-radius: 3px;
+                cursor: help;
+            }
+            /* Column sizing for the results table. Every column gets a
+               percentage width plus a pixel floor. */
+            .prompt-cell {
+                width: 35%;
+                min-width: 350px;
+                word-wrap: break-word;
+                font-family: "Monaco", "Menlo", monospace;
+                font-size: 12px;
+                background-color: #f8f9fa;
+            }
+            .model-cell {
+                width: 15%;
+                min-width: 180px;
+                word-wrap: break-word;
+            }
+            /* Shared by the "Expected Tool Calls" and "LLM Tool Calls" columns. */
+            .tool-calls-cell {
+                width: 12%;
+                min-width: 120px;
+                word-wrap: break-word;
+                white-space: normal;
+            }
+            .accuracy-cell {
+                width: 8%;
+                min-width: 80px;
+                text-align: center;
+            }
+            .baseline-accuracy-cell {
+                width: 8%;
+                min-width: 80px;
+                text-align: center;
+            }
+            /* Neutral badge for the baseline accuracy value; one of the three
+               modifier classes below is added when a comparison result exists. */
+            .accuracy-comparison {
+                background: #e9ecef;
+                padding: 2px 6px;
+                border-radius: 3px;
+                font-weight: bold;
+            }
+            .accuracy-improved {
+                background: #d4edda;
+                color: #155724;
+            }
+            .accuracy-regressed {
+                background: #f8d7da;
+                color: #721c24;
+            }
+            .accuracy-same {
+                background: #e2e3e5;
+                color: #495057;
+            }
+            .response-time-cell {
+                width: 10%;
+                min-width: 100px;
+                text-align: center;
+            }
+            .tokens-cell {
+                width: 10%;
+                min-width: 100px;
+                text-align: center;
+            }
+            /* Arrow shown before the prompt; its text is swapped when the
+               row is expanded or collapsed. */
+            .expand-indicator {
+                margin-right: 8px;
+                font-weight: bold;
+                color: #00684a;
+            }
+            /* Run status colours. NOTE(review): no markup in the visible part
+               of this template uses these classes - confirm they are needed. */
+            .status-done {
+                color: #28a745;
+                font-weight: bold;
+            }
+            .status-failed {
+                color: #dc3545;
+                font-weight: bold;
+            }
+            .status-in-progress {
+                color: #ffc107;
+                font-weight: bold;
+            }
+            /* Narrow viewports: tighter padding, smaller table text, and
+               single-column info grids. */
+            @media (max-width: 768px) {
+                .container {
+                    padding: 15px;
+                }
+                table {
+                    font-size: 12px;
+                }
+                th,
+                td {
+                    padding: 8px 4px;
+                }
+                .info-grid,
+                .stat-grid {
+                    grid-template-columns: 1fr;
+                }
+            }
+        </style>
+    </head>
+    <body>
+        <div class="container">
+            <h1>📊 MongoDB MCP Server - Accuracy Test Summary</h1>
+            <div class="header-info">
+                <h2>📊 Current Run Information</h2>
+                <div class="info-grid">
+                    <div class="info-item">
+                        <div class="info-label">Accuracy Run ID</div>
+                        <div class="info-value">{{accuracyRunId}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Commit SHA</div>
+                        <div class="info-value">{{commitSHA}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Run Created On</div>
+                        <div class="info-value">{{createdOn}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Report Generated On</div>
+                        <div class="info-value">{{reportGeneratedOn}}</div>
+                    </div>
+                </div>
+            </div>
+
+            <div class="header-info">
+                <h2>📈 Test Results Summary</h2>
+                <div class="info-grid">
+                    <div class="info-item">
+                        <div class="info-label">Total Prompts Evaluated</div>
+                        <div class="info-value">{{totalTests}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Models Tested</div>
+                        <div class="info-value">{{modelsCount}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Evals with 0% Accuracy</div>
+                        <div class="info-value">{{testsWithZeroAccuracy}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Average Accuracy</div>
+                        <div class="info-value">{{averageAccuracy}}</div>
+                    </div>
+                </div>
+            </div>
+
+            <div class="header-info">
+                <h2>🔄 Baseline Comparison</h2>
+                <div class="info-grid">
+                    <div class="info-item">
+                        <div class="info-label">Baseline Accuracy Run ID</div>
+                        <div class="info-value">{{baselineAccuracyRunId}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Baseline Commit SHA</div>
+                        <div class="info-value">{{baselineCommitSHA}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Baseline Run Created On</div>
+                        <div class="info-value">{{baselineCreatedOn}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Evals Improved vs Baseline</div>
+                        <div class="info-value">{{evalsImproved}}</div>
+                    </div>
+                    <div class="info-item">
+                        <div class="info-label">Evals Regressed vs Baseline</div>
+                        <div class="info-value">{{evalsRegressed}}</div>
+                    </div>
+                </div>
+            </div>
+            <table>
+                <thead>
+                    <tr>
+                        <th>Prompt</th>
+                        <th>Model</th>
+                        <th>Expected Tool Calls</th>
+                        <th>LLM Tool Calls</th>
+                        <th>Accuracy</th>
+                        <th>Baseline Accuracy</th>
+                        <th>LLM Response Time (ms)</th>
+                        <th>Total Tokens Used</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    {{tableRows}}
+                </tbody>
+            </table>
+        </div>
+        <script>
+            function toggleDetails(index) {
+                const detailsRow = document.getElementById("details-" + index);
+                const indicator = document.getElementById("indicator-" + index);
+                const testRow = detailsRow.previousElementSibling;
+                if (detailsRow.classList.contains("visible")) {
+                    detailsRow.classList.remove("visible");
+                    indicator.textContent = "▶";
+                    testRow.classList.remove("expanded-row");
+                } else {
+                    detailsRow.classList.add("visible");
+                    indicator.textContent = "▼";
+                    testRow.classList.add("expanded-row");
+                }
+            }
+        </script>
+    </body>
+</html>
diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts
new file mode 100644
index 00000000..fba40610
--- /dev/null
+++ b/scripts/generate-test-summary.ts
@@ -0,0 +1,288 @@
+import { readFile, writeFile } from "fs/promises";
+import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js";
+import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js";
+import type {
+    AccuracySnapshotEntry,
+    ExpectedToolCall,
+    LLMToolCall,
+} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Outcome of comparing one snapshot entry with its counterpart in the baseline run. */
+interface BaselineComparison {
+    /** `toolCallingAccuracy` of the matching baseline entry. */
+    baselineAccuracy?: number;
+    /** How the current entry's accuracy relates to the baseline accuracy. */
+    comparisonResult?: "improved" | "regressed" | "same";
+}
+
+/** A snapshot entry optionally annotated with its baseline comparison. */
+interface SnapshotEntryWithBaseline extends AccuracySnapshotEntry {
+    /** Absent when no baseline entry matched this entry. */
+    baseline?: BaselineComparison;
+}
+
+/**
+ * Replaces every `{{name}}` placeholder in `template` with `data[name]`.
+ * Placeholders without a matching entry are replaced with an empty string.
+ */
+function populateTemplate(template: string, data: Record<string, string>): string {
+    const placeholderPattern = /\{\{(\w+)\}\}/g;
+    return template.replace(placeholderPattern, (_match, placeholder: string) => {
+        const replacement = data[placeholder];
+        return replacement ?? "";
+    });
+}
+
+/** Renders an accuracy fraction (e.g. 0.5) as a percentage with one decimal (e.g. "50.0%"). */
+function formatAccuracy(accuracy: number): string {
+    const percentage = accuracy * 100;
+    return `${percentage.toFixed(1)}%`;
+}
+
+/**
+ * Maps an accuracy fraction to the CSS class used for its badge:
+ * exactly 1 is "perfect", 0.75 and above is "good", everything else is "poor".
+ */
+function getAccuracyClass(accuracy: number): string {
+    if (accuracy === 1) {
+        return "accuracy-perfect";
+    }
+    return accuracy >= 0.75 ? "accuracy-good" : "accuracy-poor";
+}
+
+/**
+ * Escapes the characters that are significant inside HTML text and
+ * double-quoted attribute values.
+ */
+function escapeHtmlAttribute(value: string): string {
+    return value
+        .replace(/&/g, "&amp;")
+        .replace(/</g, "&lt;")
+        .replace(/>/g, "&gt;")
+        .replace(/"/g, "&quot;");
+}
+
+/**
+ * Renders tool calls as a comma separated list of `<span>` chips. Each chip
+ * shows the tool name and carries the pretty-printed call parameters in its
+ * `title` attribute so that they appear as a hover tooltip.
+ *
+ * Tool names and parameters may come from LLM output, so both are HTML-escaped
+ * before being embedded in the markup.
+ */
+function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[]): string {
+    return toolCalls
+        .map((call) => {
+            // JSON.stringify returns undefined for an undefined value; fall
+            // back to an empty tooltip instead of throwing.
+            const params = JSON.stringify(call.parameters, null, 2) ?? "";
+            const tooltip = escapeHtmlAttribute(params);
+            const toolName = escapeHtmlAttribute(String(call.toolName));
+            return `<span class="tool-call" title="${tooltip}">${toolName}</span>`;
+        })
+        .join(", ");
+}
+
+/**
+ * Renders the total token count as a `<span>` whose tooltip lists the
+ * prompt, completion and total token counts. Missing counts are shown as 0.
+ */
+function formatTokenUsage(tokensUsage: {
+    promptTokens?: number;
+    completionTokens?: number;
+    totalTokens?: number;
+}): string {
+    const { promptTokens, completionTokens, totalTokens } = tokensUsage;
+    const totalCount = totalTokens || 0;
+    const promptCount = promptTokens || 0;
+    const completionCount = completionTokens || 0;
+
+    const tooltipLines = [`Prompt: ${promptCount}`, `Completion: ${completionCount}`, `Total: ${totalCount}`];
+    return `<span class="tokens-usage" title="${tooltipLines.join("\n")}">${totalCount}</span>`;
+}
+
+/** Pretty-prints every message as JSON and separates the messages with a blank line. */
+function formatMessages(messages: Array<Record<string, unknown>>): string {
+    const serialized: string[] = [];
+    for (const message of messages) {
+        serialized.push(JSON.stringify(message, null, 2));
+    }
+    return serialized.join("\n\n");
+}
+
+/**
+ * Renders the baseline accuracy badge for a snapshot entry.
+ *
+ * Returns an "N/A" badge when the entry has no baseline accuracy. Otherwise
+ * the badge shows the formatted baseline accuracy and, when a comparison
+ * result is present, a trend arrow plus the matching modifier class.
+ */
+function formatBaselineAccuracy(snapshot: SnapshotEntryWithBaseline): string {
+    const baseline = snapshot.baseline;
+    if (baseline?.baselineAccuracy === undefined) {
+        return '<span class="accuracy-comparison">N/A</span>';
+    }
+
+    // Modifier class suffix and trend arrow per comparison result.
+    const decorations = new Map<string, { classSuffix: string; icon: string }>([
+        ["improved", { classSuffix: " accuracy-improved", icon: " ↗" }],
+        ["regressed", { classSuffix: " accuracy-regressed", icon: " ↘" }],
+        ["same", { classSuffix: " accuracy-same", icon: " →" }],
+    ]);
+    const decoration = baseline.comparisonResult ? decorations.get(baseline.comparisonResult) : undefined;
+
+    const cssClasses = "accuracy-comparison" + (decoration ? decoration.classSuffix : "");
+    const trendIcon = decoration ? decoration.icon : "";
+
+    return `<span class="${cssClasses}">${formatAccuracy(baseline.baselineAccuracy)}${trendIcon}</span>`;
+}
+
+/**
+ * Annotates each current snapshot entry with the accuracy of the baseline
+ * entry that has the same provider, requested model and prompt.
+ *
+ * Entries without a baseline counterpart are returned as-is. When several
+ * baseline entries share a key, the last one is used.
+ */
+function compareSnapshotEntries(
+    currentSnapshotEntries: AccuracySnapshotEntry[],
+    baselineSnapshotEntries: AccuracySnapshotEntry[]
+): SnapshotEntryWithBaseline[] {
+    const keyOf = (entry: AccuracySnapshotEntry): string =>
+        `${entry.provider}|${entry.requestedModel}|${entry.prompt}`;
+
+    const baselineByKey = baselineSnapshotEntries.reduce(
+        (lookup, baselineEntry) => lookup.set(keyOf(baselineEntry), baselineEntry),
+        new Map<string, AccuracySnapshotEntry>()
+    );
+
+    return currentSnapshotEntries.map((current) => {
+        const matchingBaseline = baselineByKey.get(keyOf(current));
+        if (!matchingBaseline) {
+            return current;
+        }
+
+        const currentAccuracy = current.toolCallingAccuracy;
+        const baselineAccuracy = matchingBaseline.toolCallingAccuracy;
+        const comparisonResult =
+            currentAccuracy > baselineAccuracy
+                ? "improved"
+                : currentAccuracy < baselineAccuracy
+                  ? "regressed"
+                  : "same";
+
+        return { ...current, baseline: { baselineAccuracy, comparisonResult } };
+    });
+}
+
+/**
+ * Escapes the characters that are significant in HTML so that arbitrary
+ * text (prompts, model names, LLM output) is rendered literally instead of
+ * being interpreted as markup.
+ */
+function escapeHtmlText(value: unknown): string {
+    return String(value ?? "")
+        .replace(/&/g, "&amp;")
+        .replace(/</g, "&lt;")
+        .replace(/>/g, "&gt;")
+        .replace(/"/g, "&quot;")
+        .replace(/'/g, "&#39;");
+}
+
+/**
+ * Builds the HTML accuracy report for one accuracy run.
+ *
+ * @param snapshotEntries Entries of the current run, optionally annotated
+ *   with their baseline comparison.
+ * @param accuracyRunId Identifier of the current accuracy run.
+ * @param baselineInfo Details of the baseline run; "N/A" is shown for each
+ *   field when omitted.
+ * @returns The summary template with all placeholders filled in.
+ *
+ * Run-level details (status, commit SHA, creation time) are taken from the
+ * first snapshot entry and fall back to "unknown" when there are no entries.
+ */
+async function generateHtmlReport(
+    snapshotEntries: SnapshotEntryWithBaseline[],
+    accuracyRunId: string,
+    baselineInfo?: {
+        commitSHA: string;
+        accuracyRunId: string;
+        createdOn: string;
+    }
+): Promise<string> {
+    const totalPrompts = snapshotEntries.length;
+    const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size;
+    const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0);
+
+    const totalAccuracy = snapshotEntries.reduce((sum, entry) => sum + entry.toolCallingAccuracy, 0);
+    // Guard against division by zero for an empty run.
+    const averageAccuracy = totalPrompts > 0 ? totalAccuracy / totalPrompts : 0;
+
+    const evalsImproved = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "improved").length;
+    const evalsRegressed = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "regressed").length;
+
+    const firstSnapshotEntry = snapshotEntries[0];
+    const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown";
+    const commitSHA = firstSnapshotEntry?.commitSHA || "unknown";
+    const createdOn = firstSnapshotEntry?.createdOn
+        ? new Date(firstSnapshotEntry.createdOn).toLocaleString()
+        : "unknown";
+    const reportGeneratedOn = new Date().toLocaleString();
+
+    const tableRows = snapshotEntries
+        .map((snapshotEntry, index) => {
+            // Prompts, model identifiers and everything produced by the LLM
+            // are free-form text; escape them so they cannot break or inject
+            // markup into the report.
+            const prompt = escapeHtmlText(snapshotEntry.prompt);
+            const model = `${escapeHtmlText(snapshotEntry.provider)} - ${escapeHtmlText(snapshotEntry.requestedModel)}`;
+            const llmResponse = escapeHtmlText(snapshotEntry.text);
+            const conversation = escapeHtmlText(formatMessages(snapshotEntry.messages));
+
+            return `
+                <tr class="test-row" onclick="toggleDetails(${index})">
+                    <td class="prompt-cell">
+                        <span class="expand-indicator" id="indicator-${index}">▶</span>
+                        ${prompt}
+                    </td>
+                    <td class="model-cell">${model}</td>
+                    <td class="tool-calls-cell">${formatToolCallsWithTooltip(snapshotEntry.expectedToolCalls)}</td>
+                    <td class="tool-calls-cell">${formatToolCallsWithTooltip(snapshotEntry.actualToolCalls)}</td>
+                    <td class="accuracy-cell">
+                        <span class="${getAccuracyClass(snapshotEntry.toolCallingAccuracy)}">
+                            ${formatAccuracy(snapshotEntry.toolCallingAccuracy)}
+                        </span>
+                    </td>
+                    <td class="baseline-accuracy-cell">${formatBaselineAccuracy(snapshotEntry)}</td>
+                    <td class="response-time-cell">${snapshotEntry.llmResponseTime.toFixed(2)}</td>
+                    <td class="tokens-cell">${formatTokenUsage(snapshotEntry.tokensUsage || {})}</td>
+                </tr>
+                <tr class="details-row" id="details-${index}">
+                    <td colspan="8">
+                        <div class="details-content">
+                            <div class="conversation-section">
+                                <h4>🤖 LLM Response</h4>
+                                <div class="conversation-content">${llmResponse}</div>
+                            </div>
+                            <div class="conversation-section">
+                                <h4>💬 Conversation Messages</h4>
+                                <div class="conversation-content">${conversation}</div>
+                            </div>
+                        </div>
+                    </td>
+                </tr>
+            `;
+        })
+        .join("");
+
+    const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8");
+    return populateTemplate(template, {
+        accuracyRunId,
+        runStatus,
+        runStatusUpper: runStatus.toUpperCase(),
+        commitSHA,
+        reportGeneratedOn,
+        createdOn,
+        totalTests: String(totalPrompts),
+        modelsCount: String(modelsCount),
+        testsWithZeroAccuracy: String(testsWithZeroAccuracy.length),
+        averageAccuracy: formatAccuracy(averageAccuracy),
+        baselineCommitSHA: baselineInfo?.commitSHA || "N/A",
+        baselineAccuracyRunId: baselineInfo?.accuracyRunId || "N/A",
+        baselineCreatedOn: baselineInfo?.createdOn || "N/A",
+        evalsImproved: String(evalsImproved),
+        evalsRegressed: String(evalsRegressed),
+        tableRows,
+    });
+}
+
+/**
+ * Script entry point: loads the snapshot of the accuracy run identified by
+ * `MDB_ACCURACY_RUN_ID`, optionally compares it with the latest snapshot of
+ * the commit in `MDB_ACCURACY_BASELINE_COMMIT`, writes the HTML report and
+ * prints a short summary to the console.
+ *
+ * Nothing is written when the current run has no snapshot entries. Any
+ * failure is logged and terminates the process with exit code 1.
+ */
+async function generateTestSummary(): Promise<void> {
+    try {
+        const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID;
+        const baselineCommitSHA = process.env.MDB_ACCURACY_BASELINE_COMMIT;
+
+        if (!accuracyRunId) {
+            throw new Error("Cannot generate test summary, accuracy run id is unknown");
+        }
+        console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`);
+
+        const storage = await getAccuracySnapshotStorage();
+        // The finally block releases the storage on every path: success,
+        // early return and failure alike.
+        try {
+            const currentSnapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId);
+
+            if (currentSnapshot.length === 0) {
+                console.log("No snapshot entries found for the current run.");
+                return;
+            }
+
+            let snapshotWithBaseline: SnapshotEntryWithBaseline[] = currentSnapshot;
+            let baselineInfo: { commitSHA: string; accuracyRunId: string; createdOn: string } | undefined;
+
+            if (baselineCommitSHA) {
+                console.log(`🔍 Fetching baseline snapshot entries for commit: ${baselineCommitSHA}`);
+                const baselineSnapshot = await storage.getLatestSnapshotForCommit(baselineCommitSHA);
+
+                if (baselineSnapshot.length > 0) {
+                    console.log(`✅ Found ${baselineSnapshot.length} baseline snapshot entries.`);
+                    snapshotWithBaseline = compareSnapshotEntries(currentSnapshot, baselineSnapshot);
+
+                    // Run-level baseline details are read from the first entry.
+                    const firstBaselineSnapshot = baselineSnapshot[0];
+                    if (firstBaselineSnapshot) {
+                        baselineInfo = {
+                            commitSHA: firstBaselineSnapshot.commitSHA,
+                            accuracyRunId: firstBaselineSnapshot.accuracyRunId,
+                            createdOn: firstBaselineSnapshot.createdOn
+                                ? new Date(firstBaselineSnapshot.createdOn).toLocaleString()
+                                : "unknown",
+                        };
+                    }
+                } else {
+                    console.log(`⚠️  No baseline snapshots found for commit: ${baselineCommitSHA}`);
+                }
+            }
+
+            const htmlReport = await generateHtmlReport(snapshotWithBaseline, accuracyRunId, baselineInfo);
+
+            const reportPath = HTML_TESTS_SUMMARY_FILE;
+            await writeFile(reportPath, htmlReport, "utf8");
+
+            console.log(`✅ HTML report generated: ${reportPath}`);
+
+            const totalPrompts = snapshotWithBaseline.length;
+            const modelsCount = new Set(snapshotWithBaseline.map((s) => `${s.provider} ${s.requestedModel}`)).size;
+            const testsWithZeroAccuracy = snapshotWithBaseline.filter(
+                (snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0
+            );
+            const evalsImproved = snapshotWithBaseline.filter(
+                (s) => s.baseline?.comparisonResult === "improved"
+            ).length;
+            const evalsRegressed = snapshotWithBaseline.filter(
+                (s) => s.baseline?.comparisonResult === "regressed"
+            ).length;
+
+            console.log(`\n📈 Summary:`);
+            console.log(`   Total prompts evaluated: ${totalPrompts}`);
+            console.log(`   Models tested: ${modelsCount}`);
+            console.log(`   Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`);
+
+            if (baselineCommitSHA) {
+                console.log(`   Baseline commit: ${baselineCommitSHA}`);
+                console.log(`   Evals improved vs baseline: ${evalsImproved}`);
+                console.log(`   Evals regressed vs baseline: ${evalsRegressed}`);
+            }
+        } finally {
+            await storage.close();
+        }
+    } catch (error) {
+        console.error("Error generating test summary:", error);
+        process.exit(1);
+    }
+}
+
+void generateTestSummary();
diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh
new file mode 100644
index 00000000..ae02dd06
--- /dev/null
+++ b/scripts/run-accuracy-tests.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Variables necessary for the accuracy test runs
+export MDB_ACCURACY_RUN_ID=$(npx uuid v4)
+
+# For providing access tokens for different LLM providers
+# export MDB_OPEN_AI_API_KEY=""
+# export MDB_GEMINI_API_KEY=""
+# export MDB_AZURE_OPEN_AI_API_KEY=""
+# export MDB_AZURE_OPEN_AI_API_URL=""
+
+# For providing a mongodb based storage to store accuracy snapshots
+# export MDB_ACCURACY_MDB_URL=""
+# export MDB_ACCURACY_MDB_DB=""
+# export MDB_ACCURACY_MDB_COLLECTION=""
+
+# By default we run all the tests under tests/accuracy folder unless a path is
+# specified in the command line. Such as:
+# npm run test:accuracy -- tests/accuracy/some-test.test.ts
+TEST_PATH_PATTERN="${1:-tests/accuracy}"
+shift || true
+node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@"
+
+# Preserving the exit code from test run to correctly notify in the CI
+# environments when the tests fail.
+JEST_EXIT_CODE=$?
+
+# Each test run submits an accuracy snapshot entry with the accuracyRunStatus:
+# "in-progress". When all the tests are done and jest exits with an exit code of
+# 0, we can safely mark accuracy run as finished otherwise failed.
+
+# This "outside-the-tests-status-update" is arising out of the fact that each
+# test suite stores their own accuracy run data in the storage and this setup
+# might lead to data inconsistency when the tests fail. To overcome that each
+# accuracy snapshot entry has a status which by default is "in-progress" and is
+# updated when the tests either pass (all our accuracy tests are supposed to
+# pass unless some errors occurs during the test runs), or fail.
+
+# This is necessary when comparing one accuracy run with another as we wouldn't
+# want to compare against an incomplete run.
+if [ $JEST_EXIT_CODE -eq 0 ]; then
+  MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'"
+  npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report"
+else
+  MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'"
+fi
+
+
+exit $JEST_EXIT_CODE
\ No newline at end of file
diff --git a/scripts/update-accuracy-run-status.ts b/scripts/update-accuracy-run-status.ts
new file mode 100644
index 00000000..6d8e3895
--- /dev/null
+++ b/scripts/update-accuracy-run-status.ts
@@ -0,0 +1,18 @@
+import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js";
+import { AccuracyRunStatus } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+// Marks every snapshot entry of the accuracy run identified by
+// MDB_ACCURACY_RUN_ID with the status given in MDB_ACCURACY_RUN_STATUS.
+const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID;
+const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS;
+
+// Only the two terminal statuses may be set from this script.
+if (
+    !envAccuracyRunId ||
+    (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed)
+) {
+    console.error(
+        `Cannot update accuracy run status: MDB_ACCURACY_RUN_ID must be set and MDB_ACCURACY_RUN_STATUS must be ` +
+            `"${AccuracyRunStatus.Done}" or "${AccuracyRunStatus.Failed}"`
+    );
+    process.exit(1);
+}
+
+const timerLabel = `Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`;
+console.time(timerLabel);
+const storage = await getAccuracySnapshotStorage();
+try {
+    await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus);
+} finally {
+    // Release the storage even when the update fails.
+    await storage.close();
+}
+console.timeEnd(timerLabel);
diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts
new file mode 100644
index 00000000..30a5a0e3
--- /dev/null
+++ b/tests/accuracy/aggregate.test.ts
@@ -0,0 +1,16 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+// Grouping prompt that is expected to resolve to a single `aggregate` call.
+describeAccuracyTests(getAvailableModels(), [
+    {
+        prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
+        expectedToolCalls: [
+            {
+                toolName: "aggregate",
+                parameters: {
+                    // An aggregation pipeline is an ordered array of stages,
+                    // so the expected value is a one-element array rather than
+                    // a bare stage object.
+                    pipeline: [{ $group: { _id: "$release_year", count: { $sum: 1 } } }],
+                },
+            },
+        ],
+    },
+]);
diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts
new file mode 100644
index 00000000..dab7d317
--- /dev/null
+++ b/tests/accuracy/collection-indexes.test.ts
@@ -0,0 +1,26 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/**
+ * Builds a test config that expects `prompt` to result in a single
+ * `collection-indexes` call on the `mflix.movies` namespace.
+ */
+function callsCollectionIndexes(prompt: string): AccuracyTestConfig {
+    const config: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [
+            { toolName: "collection-indexes", parameters: { database: "mflix", collection: "movies" } },
+        ],
+    };
+    return config;
+}
+
+// Three phrasings that should all resolve to one `collection-indexes` call on
+// mflix.movies: a count question, an explicit listing request, and a question
+// about whether a given query is indexed.
+describeAccuracyTests(getAvailableModels(), [
+    callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"),
+    callsCollectionIndexes("List all the indexes in movies collection in mflix database"),
+    callsCollectionIndexes(
+        `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?`
+    ),
+]);
diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts
new file mode 100644
index 00000000..f2f22a88
--- /dev/null
+++ b/tests/accuracy/collection-schema.test.ts
@@ -0,0 +1,23 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/**
+ * Builds a test config that expects `prompt` to result in a single
+ * `collection-schema` call on the `db1.coll1` namespace.
+ */
+function callsCollectionSchema(prompt: string): AccuracyTestConfig {
+    const config: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [
+            { toolName: "collection-schema", parameters: { database: "db1", collection: "coll1" } },
+        ],
+    };
+    return config;
+}
+
+// Field-existence and field-type questions; both are expected to be answered
+// with a `collection-schema` call on db1.coll1.
+describeAccuracyTests(getAvailableModels(), [
+    callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"),
+    callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"),
+]);
diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts
new file mode 100644
index 00000000..2bd2f021
--- /dev/null
+++ b/tests/accuracy/collection-storage-size.test.ts
@@ -0,0 +1,42 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+// Storage size prompts: one for a single namespace and one for a whole
+// database.
+describeAccuracyTests(getAvailableModels(), [
+    {
+        // A namespace is named explicitly, so one direct call is expected.
+        prompt: "What is the size of 'mflix.movies' namespace",
+        expectedToolCalls: [
+            {
+                toolName: "collection-storage-size",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+        ],
+    },
+    {
+        // Only the database is named: the collections are expected to be
+        // listed first, followed by one size lookup per collection.
+        // NOTE(review): assumes the comics database contains exactly the
+        // `books` and `characters` collections - confirm against the test data.
+        prompt: "How much size is each collection in comics database",
+        expectedToolCalls: [
+            {
+                toolName: "list-collections",
+                parameters: {
+                    database: "comics",
+                },
+            },
+            {
+                toolName: "collection-storage-size",
+                parameters: {
+                    database: "comics",
+                    collection: "books",
+                },
+            },
+            {
+                toolName: "collection-storage-size",
+                parameters: {
+                    database: "comics",
+                    collection: "characters",
+                },
+            },
+        ],
+    },
+]);
diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts
new file mode 100644
index 00000000..09db4678
--- /dev/null
+++ b/tests/accuracy/count.test.ts
@@ -0,0 +1,54 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/**
+ * Builds a test config that expects `prompt` to result in a single `count`
+ * call without a query on the given namespace (defaults to `mflix.movies`).
+ */
+function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig {
+    const config: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [{ toolName: "count", parameters: { database, collection } }],
+    };
+    return config;
+}
+
+/**
+ * Builds a test config that expects `prompt` to result in a single `count`
+ * call carrying `query` on the given namespace. The namespace defaults to
+ * `mflix.movies` and the query to an empty object.
+ */
+function callsCountToolWithQuery(
+    prompt: string,
+    database = "mflix",
+    collection = "movies",
+    query: Record<string, unknown> = {}
+): AccuracyTestConfig {
+    const config: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [{ toolName: "count", parameters: { database, collection, query } }],
+    };
+    return config;
+}
+
+// Count prompts: two without a filter (default and explicit namespace) and
+// one where the runtime condition is expected to be passed as the query.
+describeAccuracyTests(getAvailableModels(), [
+    callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."),
+    callsCountToolWithEmptyQuery(
+        "How many documents are there in 'characters' collection in 'comics' database?",
+        "comics",
+        "characters"
+    ),
+    callsCountToolWithQuery(
+        "Count all the documents in 'mflix.movies' namespace with runtime less than 100?",
+        "mflix",
+        "movies",
+        { runtime: { $lt: 100 } }
+    ),
+]);
diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts
new file mode 100644
index 00000000..db7f888c
--- /dev/null
+++ b/tests/accuracy/create-collection.test.ts
@@ -0,0 +1,51 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/**
+ * Builds a test config that expects `prompt` to result in a single
+ * `create-collection` call for the given database and collection.
+ */
+function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig {
+    const config: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [{ toolName: "create-collection", parameters: { database, collection } }],
+    };
+    return config;
+}
+
+/**
+ * Builds a test config for prompts that are expected to produce the given
+ * sequence of tool calls. The config sets `injectConnectedAssumption` and
+ * passes an empty `mockedTools` map.
+ */
+function callsCreateCollectionWithListCollections(prompt: string, expectedToolCalls: ExpectedToolCall[]) {
+    const config = {
+        injectConnectedAssumption: true,
+        prompt,
+        mockedTools: {},
+        expectedToolCalls,
+    };
+    return config;
+}
+
+// Two direct creation prompts, plus a conditional one where the existing
+// collections are expected to be listed before the collection is created.
+describeAccuracyTests(getAvailableModels(), [
+    callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"),
+    callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"),
+    callsCreateCollectionWithListCollections(
+        "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
+        [
+            {
+                toolName: "list-collections",
+                parameters: {
+                    database: "mflix",
+                },
+            },
+            {
+                toolName: "create-collection",
+                parameters: {
+                    database: "mflix",
+                    collection: "documentaries",
+                },
+            },
+        ]
+    ),
+]);
diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts
new file mode 100644
index 00000000..6dae12e5
--- /dev/null
+++ b/tests/accuracy/create-index.test.ts
@@ -0,0 +1,31 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/**
+ * Builds a test config that expects `prompt` to result in a single
+ * `create-index` call on `mflix.movies` with `indexKeys` as the index keys.
+ */
+function callsCreateIndex(prompt: string, indexKeys: Record<string, unknown>): AccuracyTestConfig {
+    const config: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [
+            {
+                toolName: "create-index",
+                parameters: { database: "mflix", collection: "movies", keys: indexKeys },
+            },
+        ],
+    };
+    return config;
+}
+
+// One prompt where the index keys have to be derived from a query shape and
+// one that asks for a text index explicitly.
+describeAccuracyTests(getAvailableModels(), [
+    callsCreateIndex(
+        "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }",
+        {
+            release_year: 1,
+        }
+    ),
+    callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", {
+        title: "text",
+    }),
+]);
diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts
new file mode 100644
index 00000000..656eccc2
--- /dev/null
+++ b/tests/accuracy/db-stats.test.ts
@@ -0,0 +1,17 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/**
+ * Builds a test case that expects exactly one `db-stats` call.
+ *
+ * @param prompt - Natural-language request handed to the model.
+ * @param database - Database expected in the call; defaults to "mflix".
+ */
+function callsDbStats(prompt: string, database = "mflix"): AccuracyTestConfig {
+    return {
+        prompt,
+        expectedToolCalls: [{ toolName: "db-stats", parameters: { database } }],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [callsDbStats("What is the size occupied by database mflix?")]);
diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts
new file mode 100644
index 00000000..c0dd4d51
--- /dev/null
+++ b/tests/accuracy/delete-many.test.ts
@@ -0,0 +1,35 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/**
+ * Builds a test case that expects one `delete-many` call on `mflix.movies`
+ * carrying no `filter` parameter.
+ */
+function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig {
+    const unfilteredDelete = {
+        toolName: "delete-many",
+        parameters: { database: "mflix", collection: "movies" },
+    };
+    return { prompt, expectedToolCalls: [unfilteredDelete] };
+}
+
+/**
+ * Builds a test case that expects one `delete-many` call on `mflix.movies`
+ * filtered to documents whose `runtime` is below 100.
+ */
+function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig {
+    const shortRuntime = { runtime: { $lt: 100 } };
+    const filteredDelete = {
+        toolName: "delete-many",
+        parameters: { database: "mflix", collection: "movies", filter: shortRuntime },
+    };
+    return { prompt, expectedToolCalls: [filteredDelete] };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    // Prompts that ask for every document to be removed.
+    callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"),
+    callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"),
+    // Prompt that narrows the deletion with a runtime condition.
+    callsDeleteManyWithFilters("Remove all the documents from namespace 'mflix.movies' where runtime is less than 100"),
+]);
diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts
new file mode 100644
index 00000000..98ba3348
--- /dev/null
+++ b/tests/accuracy/drop-collection.test.ts
@@ -0,0 +1,34 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/**
+ * Builds a test case that expects a lone `drop-collection` call targeting
+ * `mflix.movies`.
+ */
+function onlyCallsDropCollection(prompt: string): AccuracyTestConfig {
+    const dropMovies = {
+        toolName: "drop-collection",
+        parameters: { database: "mflix", collection: "movies" },
+    };
+    return { prompt, expectedToolCalls: [dropMovies] };
+}
+
+/** Builds a test case from a prompt and a caller-supplied list of expected tool calls. */
+function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig {
+    return { prompt, expectedToolCalls };
+}
+
+// Databases whose collections are expected to be listed while searching for
+// the `books` collection.
+const databasesToSearch = ["admin", "comics", "config", "local", "mflix"];
+
+describeAccuracyTests(getAvailableModels(), [
+    onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."),
+    onlyCallsDropCollection("Drop movies collection from mflix database."),
+    callsDropCollection("Remove books collection from which ever database contains it.", [
+        { toolName: "list-databases", parameters: {} },
+        ...databasesToSearch.map((database) => ({ toolName: "list-collections", parameters: { database } })),
+        { toolName: "drop-collection", parameters: { database: "comics", collection: "books" } },
+    ]),
+]);
diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts
new file mode 100644
index 00000000..53fc7fd5
--- /dev/null
+++ b/tests/accuracy/drop-database.test.ts
@@ -0,0 +1,24 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Builds a test case that expects `drop-database` on `mflix` as the only tool call. */
+function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig {
+    const dropMflix = { toolName: "drop-database", parameters: { database: "mflix" } };
+    return { prompt, expectedToolCalls: [dropMflix] };
+}
+
+/** Builds a test case from a prompt and a caller-supplied list of expected tool calls. */
+function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig {
+    return { prompt, expectedToolCalls };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    onlyCallsDropDatabase("Remove mflix database from my cluster."),
+    onlyCallsDropDatabase("Drop database named mflix."),
+    // The conditional prompt additionally expects a `list-databases` call.
+    callsDropDatabase("If there is a mflix database in my cluster then drop it.", [
+        { toolName: "list-databases", parameters: {} },
+        { toolName: "drop-database", parameters: { database: "mflix" } },
+    ]),
+]);
diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts
new file mode 100644
index 00000000..4a539c48
--- /dev/null
+++ b/tests/accuracy/explain.test.ts
@@ -0,0 +1,50 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/**
+ * Builds a test case that expects one `explain` call on `mflix.movies`.
+ *
+ * @param prompt - Natural-language request handed to the model.
+ * @param method - The operation to explain; sent as the only element of the
+ *                 `method` array parameter.
+ */
+function callsExplain(prompt: string, method: Record<string, unknown>): AccuracyTestConfig {
+    const explainCall = {
+        toolName: "explain",
+        parameters: { database: "mflix", collection: "movies", method: [method] },
+    };
+    return { prompt, expectedToolCalls: [explainCall] };
+}
+
+/** Expects an `explain` of a `find` filtered to `release_year` 2020. */
+function callsExplainWithFind(prompt: string): AccuracyTestConfig {
+    return callsExplain(prompt, { name: "find", arguments: { filter: { release_year: 2020 } } });
+}
+
+/** Expects an `explain` of an `aggregate` whose pipeline is one `$match` on `release_year` 2020. */
+function callsExplainWithAggregate(prompt: string): AccuracyTestConfig {
+    const pipeline = [{ $match: { release_year: 2020 } }];
+    return callsExplain(prompt, { name: "aggregate", arguments: { pipeline } });
+}
+
+/** Expects an `explain` of a `count` whose query is `release_year` 2020. */
+function callsExplainWithCount(prompt: string): AccuracyTestConfig {
+    return callsExplain(prompt, { name: "count", arguments: { query: { release_year: 2020 } } });
+}
+
+/**
+ * None of these tests score a parameter match on any of the models, likely
+ * because the tool schema uses Zod.union where Zod.discriminatedUnion would
+ * probably have been the better choice.
+ */
+describeAccuracyTests(getAvailableModels(), [
+    callsExplainWithFind(
+        `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`
+    ),
+    callsExplainWithAggregate(
+        `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`
+    ),
+    callsExplainWithCount(
+        `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`
+    ),
+]);
diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts
new file mode 100644
index 00000000..02c02cd1
--- /dev/null
+++ b/tests/accuracy/find.test.ts
@@ -0,0 +1,68 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/** Builds a test case that expects a single `find` call with exactly the given parameters. */
+function expectsSingleFind(prompt: string, parameters: Record<string, unknown>): AccuracyTestConfig {
+    return { prompt, expectedToolCalls: [{ toolName: "find", parameters }] };
+}
+
+/** Expects `find` on the given namespace with only `database` and `collection` parameters. */
+function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig {
+    return expectsSingleFind(prompt, { database, collection });
+}
+
+/** Expects `find` on `mflix.movies` with the given filter. */
+function callsFindWithFilter(prompt: string, filter: Record<string, unknown>): AccuracyTestConfig {
+    return expectsSingleFind(prompt, { database: "mflix", collection: "movies", filter });
+}
+
+/** Expects `find` on `mflix.movies` with the given projection and no filter. */
+function callsFindWithProjection(prompt: string, projection: Record<string, number>): AccuracyTestConfig {
+    return expectsSingleFind(prompt, { database: "mflix", collection: "movies", projection });
+}
+
+/** Expects `find` on `mflix.movies` with both the given filter and projection. */
+function callsFindWithProjectionAndFilters(
+    prompt: string,
+    filter: Record<string, unknown>,
+    projection: Record<string, number>
+): AccuracyTestConfig {
+    return expectsSingleFind(prompt, { database: "mflix", collection: "movies", filter, projection });
+}
+
+/** Expects `find` on `mflix.movies` with the given filter, sort and limit. */
+function callsFindWithFilterSortAndLimit(
+    prompt: string,
+    filter: Record<string, unknown>,
+    sort: Record<string, number>,
+    limit: number
+): AccuracyTestConfig {
+    return expectsSingleFind(prompt, { database: "mflix", collection: "movies", filter, sort, limit });
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    // No filter.
+    callsFindNoFilter("List all the movies in 'mflix.movies' namespace."),
+    callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"),
+    // Filter only.
+    callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", {
+        runtime: { $lt: 100 },
+    }),
+    callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", {
+        director: "Christina Collins",
+    }),
+    // Projection, without and with a filter.
+    callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }),
+    callsFindWithProjectionAndFilters(
+        "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'",
+        { title: "Certain Fish" },
+        { cast: 1 }
+    ),
+    // Filter combined with sort and limit.
+    callsFindWithFilterSortAndLimit(
+        "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime",
+        { genres: "Horror" },
+        { runtime: 1 },
+        2
+    ),
+]);
diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts
new file mode 100644
index 00000000..4ce15bb8
--- /dev/null
+++ b/tests/accuracy/insert-many.test.ts
@@ -0,0 +1,49 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/**
+ * Builds a test case that expects one `insert-many` call on `mflix.movies`
+ * inserting three documents shaped `{ id: <n>, name: "name<n>" }`, n = 1..3.
+ *
+ * The expected field names have to match what the prompt asks for (`id` and
+ * `name`); with any other field name a model that follows the prompt can never
+ * score a parameter match.
+ */
+function callsInsertMany(prompt: string): AccuracyTestConfig {
+    const documents = [
+        { id: 1, name: "name1" },
+        { id: 2, name: "name2" },
+        { id: 3, name: "name3" },
+    ];
+    return {
+        prompt,
+        expectedToolCalls: [
+            { toolName: "insert-many", parameters: { database: "mflix", collection: "movies", documents } },
+        ],
+    };
+}
+
+/**
+ * Builds a test case that expects one `insert-many` call on `mflix.movies`
+ * inserting three empty documents.
+ */
+function callsEmptyInsertMany(prompt: string): AccuracyTestConfig {
+    const documents = [{}, {}, {}];
+    return {
+        prompt,
+        expectedToolCalls: [
+            { toolName: "insert-many", parameters: { database: "mflix", collection: "movies", documents } },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsInsertMany(
+        [
+            "In my namespace 'mflix.movies', insert 3 documents each with the following fields:",
+            "- id: an incremental number starting from 1",
+            "- name: a string of format 'name<id>'",
+        ].join("\n")
+    ),
+    callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"),
+]);
diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts
new file mode 100644
index 00000000..78a14f34
--- /dev/null
+++ b/tests/accuracy/list-collections.test.ts
@@ -0,0 +1,35 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/** Builds a test case that expects a single `list-collections` call on the `mflix` database. */
+function callsListCollections(prompt: string): AccuracyTestConfig {
+    const listMflixCollections = { toolName: "list-collections", parameters: { database: "mflix" } };
+    return { prompt, expectedToolCalls: [listMflixCollections] };
+}
+
+/**
+ * Builds a test case that expects `list-databases` plus one
+ * `list-collections` call for each of the databases `admin`, `comics`,
+ * `config`, `local` and `mflix`.
+ */
+function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfig {
+    const databases = ["admin", "comics", "config", "local", "mflix"];
+    return {
+        injectConnectedAssumption: true,
+        prompt,
+        mockedTools: {},
+        expectedToolCalls: [
+            { toolName: "list-databases", parameters: {} },
+            ...databases.map((database) => ({ toolName: "list-collections", parameters: { database } })),
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    // Questions scoped to the mflix database.
+    callsListCollections("How many collections do I have in database mflix?"),
+    callsListCollections("List all the collections in my MongoDB database mflix."),
+    callsListCollections("Is there a shows collection in my MongoDB database mflix?"),
+    // Cluster-wide question: expects list-databases and one list-collections call per database.
+    callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"),
+]);
diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts
new file mode 100644
index 00000000..97a8ce27
--- /dev/null
+++ b/tests/accuracy/list-databases.test.ts
@@ -0,0 +1,19 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/** Builds a test case that expects a single `list-databases` call with empty parameters. */
+function callsListDatabases(prompt: string): AccuracyTestConfig {
+    return { prompt, expectedToolCalls: [{ toolName: "list-databases", parameters: {} }] };
+}
+
+// Different phrasings that are all expected to result in a `list-databases` call.
+const listDatabasesPrompts = [
+    "How many databases do I have?",
+    "List all the databases that I have in my clusters",
+    "Is there a mflix database in my cluster?",
+];
+
+describeAccuracyTests(
+    getAvailableModels(),
+    listDatabasesPrompts.map((prompt) => callsListDatabases(prompt))
+);
diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts
new file mode 100644
index 00000000..8b9d2193
--- /dev/null
+++ b/tests/accuracy/logs.test.ts
@@ -0,0 +1,21 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Builds a test case whose only expected tool call is the one supplied. */
+function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig {
+    return { prompt, expectedToolCalls: [toolCall] };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    // Startup warnings are expected to be requested through the `type` parameter.
+    callsLogsTool("Were there any startup warnings for my MongoDB server?", {
+        toolName: "mongodb-logs",
+        parameters: { type: "startupWarnings" },
+    }),
+    // The requested number of log entries is expected in the `limit` parameter.
+    callsLogsTool("Retrieve first 10 logs for my MongoDB server?", {
+        toolName: "mongodb-logs",
+        parameters: { type: "global", limit: 10 },
+    }),
+]);
diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts
new file mode 100644
index 00000000..549a02b9
--- /dev/null
+++ b/tests/accuracy/rename-collection.test.ts
@@ -0,0 +1,35 @@
+import { AccuracyTestConfig, describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+/**
+ * Builds a test case that expects one `rename-collection` call renaming
+ * `mflix.movies` to `new_movies` without a `dropTarget` parameter.
+ */
+function callsRenameCollection(prompt: string): AccuracyTestConfig {
+    const renameCall = {
+        toolName: "rename-collection",
+        parameters: { database: "mflix", collection: "movies", newName: "new_movies" },
+    };
+    return { prompt, expectedToolCalls: [renameCall] };
+}
+
+/**
+ * Builds a test case that expects one `rename-collection` call renaming
+ * `mflix.movies` to `new_movies` with `dropTarget` set to true.
+ */
+function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig {
+    const renameCall = {
+        toolName: "rename-collection",
+        parameters: { database: "mflix", collection: "movies", newName: "new_movies", dropTarget: true },
+    };
+    return { prompt, expectedToolCalls: [renameCall] };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"),
+    // NOTE(review): the prompt below talks about removing the *old* namespace, whereas
+    // `dropTarget` presumably concerns the *target* collection - confirm the wording.
+    callsRenameCollectionWithDropTarget(
+        "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace."
+    ),
+]);
diff --git a/tests/accuracy/sdk/accuracy-scorer.ts b/tests/accuracy/sdk/accuracy-scorer.ts
new file mode 100644
index 00000000..2ae13e6c
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-scorer.ts
@@ -0,0 +1,119 @@
+import diff from "microdiff";
+import { ExpectedToolCall, LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
+
+/**
+ * Tool calling accuracy is a single number calculated based on two dimensions.
+ * 1. Did LLM call the right tool?
+ * 2. Did LLM call the tool with correct and required parameters?
+ *
+ * The number can be one of:
+ * - 0: When LLM:
+ *    - did not call the right tool
+ *    - did not call the tool with correct parameters
+ * - 0.75: When LLM:
+ *    - called the right tool but hallucinated and called some extra tools as
+ *      well or called the same tool but with different parameters
+ *    - called the right tool but hallucinated and called it with some
+ *      non-required parameters
+ * - 1: When LLM:
+ *    - called exactly the tools that were expected
+ *    - called the expected tools exactly with the expected parameters
+ *
+ * To calculate this number we must have:
+ * 1. a list of expected tool calls with their expected parameters
+ * 2. a list of LLM tool calls with their parameters
+ *
+ * For each expected tool call we find the best matching LLM tool call. Best
+ * matching LLM tool call will have:
+ * 1. the same name as that of the expected tool call
+ * 2. highest parameter similarity score, with at least 0.75 to ensure an actual
+ *    match. And in case of competing scores, we take the first one that appears
+ *    in the LLM tool calls.
+ *
+ * Using the above logic we establish pairs between expected and actual tool
+ * calls.
+ *
+ * 1. If we could not pair some LLM tool calls with expected tool calls that
+ *    means the LLM hallucinated over the extra tool calls. For that reason we
+ *    will cap the maximum achievable accuracy to 0.75.
+ *
+ * 2. If we could not pair some expected tool calls with LLM tool calls that
+ *    means the LLM did not call one of the expected tools required to solve the
+ *    problem. For that reason we will mark the accuracy as 0 and exit early.
+ *
+ * 3. Now for each of the established tool call pairs, we will determine how
+ *    correctly the parameters were called using the parameter similarity score.
+ *    The parameter similarity score follows the same accuracy number pattern
+ *    described above:
+ *      - 0 : for missing parameters, incorrect parameter values
+ *      - 0.75 : for additional parameters
+ *      - 1 : for a perfect match
+ *
+ * The final accuracy score is then calculated as the least of:
+ * - Maximum achievable accuracy from #1
+ * - The least of parameter similarity score from the established pairs in #3
+ *
+ * For examples: see the test cases in - tests/unit/accuracy-scorer.test.ts
+ */
+export function calculateToolCallingAccuracy(
+    expectedToolCalls: ExpectedToolCall[],
+    actualToolCalls: LLMToolCall[]
+): number {
+    // Nothing was expected: any call the LLM still made counts as an extra.
+    if (expectedToolCalls.length === 0) {
+        return actualToolCalls.length > 0 ? 0.75 : 1;
+    }
+
+    // Each expected call pairs with at most one actual call, so when there are
+    // more actual calls than expected ones some stay unpaired (#1 above).
+    const hasUnpairableCalls = actualToolCalls.length > expectedToolCalls.length;
+    let accuracy = hasUnpairableCalls ? 0.75 : 1;
+
+    const pairedIndexes = new Set<number>();
+    for (const expectedCall of expectedToolCalls) {
+        let bestIndex = -1;
+        let bestScore = 0;
+
+        // Walk the actual calls in order and only replace the candidate on a
+        // strictly higher score, so ties go to the earliest actual call.
+        for (const [index, actualCall] of actualToolCalls.entries()) {
+            if (pairedIndexes.has(index) || actualCall.toolName !== expectedCall.toolName) {
+                continue;
+            }
+            const score = compareParams(expectedCall.parameters, actualCall.parameters);
+            if (score >= 0.75 && score > bestScore) {
+                bestIndex = index;
+                bestScore = score;
+            }
+        }
+
+        // An unpaired expected call forces the overall minimum to 0 (#2 above).
+        if (bestIndex === -1) {
+            return 0;
+        }
+
+        pairedIndexes.add(bestIndex);
+        accuracy = Math.min(accuracy, bestScore);
+    }
+
+    return accuracy;
+}
+
+/**
+ * Scores how closely the actual parameters match the expected ones, based on
+ * the differences reported by microdiff.
+ *
+ * @returns 1 when there are no differences, 0.75 when every difference is a
+ *          "CREATE" entry, and 0 otherwise.
+ */
+function compareParams(expected: Record<string, unknown>, actual: Record<string, unknown>): number {
+    const differences = diff(expected, actual);
+    if (differences.length === 0) {
+        return 1;
+    }
+
+    // A list made up solely of "CREATE" entries cannot contain "REMOVE" or
+    // "CHANGE" entries, so this single check covers all three conditions.
+    const onlyAdditions = differences.every((difference) => difference.type === "CREATE");
+    return onlyAdditions ? 0.75 : 0;
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts
new file mode 100644
index 00000000..a919e8f0
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts
@@ -0,0 +1,159 @@
+import fs from "fs/promises";
+import {
+    AccuracyRunStatus,
+    AccuracyRunStatuses,
+    AccuracySnapshotEntry,
+    AccuracySnapshotEntrySchema,
+    AccuracySnapshotStorage,
+} from "./snapshot-storage.js";
+import { GENERATED_ASSETS_DIR, LOCAL_SNAPSHOTS_FILE } from "../constants.js";
+
+/**
+ * Snapshot storage backed by one JSON file (`LOCAL_SNAPSHOTS_FILE`) that holds
+ * an array of snapshot entries.
+ *
+ * Every mutation reads the whole file, changes the array in memory and writes
+ * it back. Nothing locks the file in between, so writers running at the same
+ * time can still overwrite each other's changes.
+ */
+export class DiskSnapshotStorage implements AccuracySnapshotStorage {
+    /** Stores a new entry, stamped as in-progress and with the current time. */
+    async createSnapshotEntry(
+        snapshotEntry: Pick<
+            AccuracySnapshotEntry,
+            | "accuracyRunId"
+            | "commitSHA"
+            | "provider"
+            | "requestedModel"
+            | "prompt"
+            | "toolCallingAccuracy"
+            | "expectedToolCalls"
+            | "actualToolCalls"
+            | "llmResponseTime"
+            | "tokensUsage"
+            | "respondingModel"
+            | "text"
+            | "messages"
+        >
+    ): Promise<void> {
+        const snapshotWithMeta: AccuracySnapshotEntry = {
+            ...snapshotEntry,
+            accuracyRunStatus: AccuracyRunStatus.InProgress,
+            createdOn: Date.now(),
+        };
+
+        await this.appendAccuracySnapshot(snapshotWithMeta);
+    }
+
+    /**
+     * Returns every entry of the run that owns the newest `Done` entry for the
+     * given commit, or an empty array when no such entry exists.
+     */
+    async getLatestSnapshotForCommit(commit: string): Promise<AccuracySnapshotEntry[]> {
+        const snapshot = await this.readSnapshot();
+        const entries = snapshot
+            .filter((entry) => {
+                return entry.commitSHA === commit && entry.accuracyRunStatus === AccuracyRunStatus.Done;
+            })
+            .sort((a, b) => b.createdOn - a.createdOn);
+        const latestRunId = entries[0]?.accuracyRunId;
+        return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : [];
+    }
+
+    /** Returns every stored entry that belongs to the given accuracy run. */
+    async getSnapshotForAccuracyRun(accuracyRunId: string): Promise<AccuracySnapshotEntry[]> {
+        const snapshot = await this.readSnapshot();
+        return snapshot.filter((entry) => entry.accuracyRunId === accuracyRunId);
+    }
+
+    /** Rewrites the file with the status of every entry of the given run set to `status`. */
+    async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) {
+        const snapshot = await this.readSnapshot();
+        const updatedSnapshot = snapshot.map((entry) => {
+            if (entry.accuracyRunId === accuracyRunId) {
+                return {
+                    ...entry,
+                    accuracyRunStatus: status,
+                };
+            }
+
+            return entry;
+        });
+        await this.writeSnapshot(updatedSnapshot);
+    }
+
+    /** Nothing to release for the disk-backed storage. */
+    close(): Promise<void> {
+        return Promise.resolve();
+    }
+
+    /**
+     * Prepends the entry to the stored array. The read-modify-write cycle is
+     * attempted up to five times, pausing 100-300ms between attempts; the
+     * error of the final attempt is rethrown.
+     */
+    private async appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise<void> {
+        for (let attempt = 0; attempt < 5; attempt++) {
+            try {
+                const snapshot = await this.readSnapshot();
+                snapshot.unshift(entry);
+                await this.writeSnapshot(snapshot);
+                return;
+            } catch (e) {
+                if (attempt < 4) {
+                    await this.waitFor(100 + Math.random() * 200);
+                } else {
+                    throw e;
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes the snapshot to a temporary file and then renames that file to
+     * `LOCAL_SNAPSHOTS_FILE` instead of writing to the target in place.
+     *
+     * The temporary name combines the process id, the current time and a
+     * random suffix: a timestamp alone lets two writers that start within the
+     * same millisecond write to (and then rename) the very same file.
+     */
+    private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise<void> {
+        const uniqueSuffix = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
+        const tmp = `${LOCAL_SNAPSHOTS_FILE}~${uniqueSuffix}`;
+        try {
+            await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2));
+            await fs.rename(tmp, LOCAL_SNAPSHOTS_FILE);
+        } catch (e) {
+            // Best effort: do not leave the temporary file behind on failure.
+            await fs.rm(tmp, { force: true }).catch(() => undefined);
+            throw e;
+        }
+    }
+
+    /**
+     * Reads and validates the stored entries. A missing file is treated as an
+     * empty snapshot; any other read, JSON or validation error is rethrown.
+     */
+    private async readSnapshot(): Promise<AccuracySnapshotEntry[]> {
+        try {
+            const raw = await fs.readFile(LOCAL_SNAPSHOTS_FILE, "utf8");
+            return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw));
+        } catch (e: unknown) {
+            if ((e as { code: string }).code === "ENOENT") {
+                return [];
+            }
+            throw e;
+        }
+    }
+
+    /** Resolves after roughly `ms` milliseconds. */
+    private waitFor(ms: number) {
+        return new Promise((resolve) => setTimeout(resolve, ms));
+    }
+
+    /** Ensures `GENERATED_ASSETS_DIR` exists and returns a new storage instance. */
+    static async getStorage() {
+        await fs.mkdir(GENERATED_ASSETS_DIR, { recursive: true });
+        return new DiskSnapshotStorage();
+    }
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts
new file mode 100644
index 00000000..da67aa60
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts
@@ -0,0 +1,16 @@
+import { DiskSnapshotStorage } from "./disk-snapshot-storage.js";
+import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js";
+import { AccuracySnapshotStorage } from "./snapshot-storage.js";
+
+/**
+ * Picks the snapshot storage for this run: the MongoDB-backed storage when
+ * `MongoDBSnapshotStorage.getStorage()` returns one, otherwise the disk-backed
+ * storage.
+ */
+export async function getAccuracySnapshotStorage(): Promise<AccuracySnapshotStorage> {
+    const mongoStorage = MongoDBSnapshotStorage.getStorage();
+    if (mongoStorage) {
+        return mongoStorage;
+    }
+    return DiskSnapshotStorage.getStorage();
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts
new file mode 100644
index 00000000..960daffc
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts
@@ -0,0 +1,114 @@
+import { Collection, MongoClient } from "mongodb";
+import {
+    AccuracyRunStatus,
+    AccuracyRunStatuses,
+    AccuracySnapshotEntry,
+    AccuracySnapshotEntrySchema,
+    AccuracySnapshotStorage,
+} from "./snapshot-storage.js";
+
+/** Connection settings for the MongoDB-backed snapshot storage. */
+type MongoDBSnapshotStorageOptions = { mongodbUrl: string; database: string; collection: string };
+
+/**
+ * Snapshot storage that keeps every snapshot entry as one document in a
+ * MongoDB collection.
+ */
+export class MongoDBSnapshotStorage implements AccuracySnapshotStorage {
+    private readonly client: MongoClient;
+    private readonly snapshotCollection: Collection;
+
+    private constructor(options: MongoDBSnapshotStorageOptions) {
+        this.client = new MongoClient(options.mongodbUrl);
+        this.snapshotCollection = this.client.db(options.database).collection(options.collection);
+    }
+
+    /** Inserts a new entry, stamped as in-progress and with the current time. */
+    async createSnapshotEntry(
+        snapshotEntry: Pick<
+            AccuracySnapshotEntry,
+            | "accuracyRunId"
+            | "commitSHA"
+            | "provider"
+            | "requestedModel"
+            | "prompt"
+            | "toolCallingAccuracy"
+            | "expectedToolCalls"
+            | "actualToolCalls"
+            | "llmResponseTime"
+            | "tokensUsage"
+            | "respondingModel"
+            | "text"
+            | "messages"
+        >
+    ): Promise<void> {
+        const entry: AccuracySnapshotEntry = {
+            ...snapshotEntry,
+            accuracyRunStatus: AccuracyRunStatus.InProgress,
+            createdOn: Date.now(),
+        };
+        await this.snapshotCollection.insertOne(entry);
+    }
+
+    /**
+     * Returns every entry of the run that owns the newest `Done` entry for the
+     * given commit, or an empty array when no such entry exists.
+     */
+    async getLatestSnapshotForCommit(commit: string): Promise<AccuracySnapshotEntry[]> {
+        const latestRunId = await this.getLatestAccuracyRunForCommit(commit);
+        if (!latestRunId) {
+            return [];
+        }
+        return this.getSnapshotForAccuracyRun(latestRunId);
+    }
+
+    /** Fetches and validates every entry that belongs to the given accuracy run. */
+    async getSnapshotForAccuracyRun(accuracyRunId: string): Promise<AccuracySnapshotEntry[]> {
+        const documents = await this.snapshotCollection.find({ accuracyRunId }).toArray();
+        return AccuracySnapshotEntrySchema.array().parse(documents);
+    }
+
+    /**
+     * Looks up the run id of the newest `Done` entry for the commit.
+     *
+     * @returns The run id as a string, or undefined when nothing matches or the
+     *          stored id is falsy.
+     */
+    private async getLatestAccuracyRunForCommit(commit: string): Promise<string | undefined> {
+        const newestDoneEntry = await this.snapshotCollection.findOne(
+            { commitSHA: commit, accuracyRunStatus: AccuracyRunStatus.Done },
+            { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } }
+        );
+        return newestDoneEntry?.accuracyRunId ? `${newestDoneEntry.accuracyRunId}` : undefined;
+    }
+
+    /** Sets the status of every entry of the given run to `status`. */
+    async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) {
+        await this.snapshotCollection.updateMany({ accuracyRunId }, { $set: { accuracyRunStatus: status } });
+    }
+
+    /** Closes the underlying MongoDB client. */
+    async close(): Promise<void> {
+        await this.client.close();
+    }
+
+    /**
+     * Creates the storage from the `MDB_ACCURACY_MDB_URL`, `MDB_ACCURACY_MDB_DB`
+     * and `MDB_ACCURACY_MDB_COLLECTION` environment variables.
+     *
+     * @returns The storage, or null when any of the three variables is unset or
+     *          empty.
+     */
+    static getStorage(): MongoDBSnapshotStorage | null {
+        const {
+            MDB_ACCURACY_MDB_URL: mongodbUrl,
+            MDB_ACCURACY_MDB_DB: database,
+            MDB_ACCURACY_MDB_COLLECTION: collection,
+        } = process.env;
+
+        if (mongodbUrl && database && collection) {
+            return new MongoDBSnapshotStorage({ mongodbUrl, database, collection });
+        }
+        return null;
+    }
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts
new file mode 100644
index 00000000..e0a6966d
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts
@@ -0,0 +1,127 @@
+import z from "zod";
+
+const LLMToolCallSchema = z.object({
+    toolCallId: z.string(), // identifier of this particular call
+    toolName: z.string(), // name of the tool that was called
+    parameters: z.record(z.string(), z.unknown()), // arguments of the call, keyed by parameter name
+});
+export type LLMToolCall = z.infer<typeof LLMToolCallSchema>;
+// An expected tool call has the same shape as an LLM tool call, minus the call id.
+const ExpectedToolCallSchema = LLMToolCallSchema.omit({ toolCallId: true });
+export type ExpectedToolCall = z.infer<typeof ExpectedToolCallSchema>;
+
+export const AccuracyRunStatus = {
+    Done: "done",
+    Failed: "failed",
+    InProgress: "in-progress",
+} as const;
+// Union of the status string values declared in AccuracyRunStatus.
+export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus];
+
+export const AccuracySnapshotEntrySchema = z.object({
+    /**
+     * A unique id for each accuracy run. Should either be generated by the
+     * script triggering the accuracy run or provided via environment variables.
+     * */
+    accuracyRunId: z.string(),
+
+    /**
+     * Represents the status of the accuracy run. Each test completion, during
+     * an accuracy run, is supposed to submit an accuracy snapshot entry with
+     * InProgress status which then later, after completion of accuracy run, is
+     * updated to either Done or Failed, depending on whether there were errors
+     * during the run or not. Defaults to InProgress when not provided. */
+    accuracyRunStatus: z
+        .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress])
+        .default(AccuracyRunStatus.InProgress),
+
+    /** Timestamp of when this snapshot entry was generated. */
+    createdOn: z.number(),
+
+    /** The commit SHA for which the accuracy run was triggered. */
+    commitSHA: z.string(),
+
+    /** The LLM provider providing the LLM APIs */
+    provider: z.string(),
+
+    /** The LLM which was requested to respond to our test prompts */
+    requestedModel: z.string(),
+
+    /** The actual prompt that was provided to LLM as test */
+    prompt: z.string(),
+
+    /** A number between 0 and 1, representing how accurately the expected tools
+     * were called by LLM when responding to the provided prompts. To know more
+     * about how this number is generated, check - accuracy-scorer.ts */
+    toolCallingAccuracy: z.number(),
+
+    /**
+     * A list of tools, along with their parameters, that are expected to be
+     * called by the LLM in test. */
+    expectedToolCalls: ExpectedToolCallSchema.array(),
+
+    /**
+     * A list of tools, along with their parameters, that were actually called
+     * by the LLM in test. */
+    actualToolCalls: LLMToolCallSchema.array(),
+
+    /**
+     * The total time taken by LLM to respond to our prompt. */
+    llmResponseTime: z.number(),
+
+    /**
+     * Token usage data, returned as part of LLM prompt response. */
+    tokensUsage: z
+        .object({
+            promptTokens: z.number().optional(),
+            completionTokens: z.number().optional(),
+            totalTokens: z.number().optional(),
+        })
+        .optional(),
+
+    /**
+     * The ID of the model that actually responded to our prompt request. */
+    respondingModel: z.string(),
+
+    /**
+     * The final response text generated by the LLM, in response to our prompt
+     * request. */
+    text: z.string(),
+
+    /**
+     * A list of messages, exchanged between LLM and our testing agent, in
+     * response to our prompt request. This is particularly helpful for
+     * debugging. */
+    messages: z.array(z.record(z.string(), z.unknown())),
+});
+
+export type AccuracySnapshotEntry = z.infer<typeof AccuracySnapshotEntrySchema>;
+
+/**
+ * Contract for storing and reading accuracy snapshot entries.
+ */
+export interface AccuracySnapshotStorage {
+    /**
+     * Stores one snapshot entry. Callers supply every entry field except
+     * `accuracyRunStatus` and `createdOn`, which are not part of the input.
+     */
+    createSnapshotEntry(snapshotEntry: Omit<AccuracySnapshotEntry, "accuracyRunStatus" | "createdOn">): Promise<void>;
+
+    /**
+     * Resolves with the snapshot entries of the latest accuracy run stored for
+     * the given commit.
+     */
+    getLatestSnapshotForCommit(commit: string): Promise<AccuracySnapshotEntry[]>;
+
+    /** Resolves with the snapshot entries stored under the given accuracy run id. */
+    getSnapshotForAccuracyRun(accuracyRunId: string): Promise<AccuracySnapshotEntry[]>;
+
+    /**
+     * Records the given status for the accuracy run with the given id (see
+     * `AccuracyRunStatus` for the possible values).
+     */
+    updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses): Promise<void>;
+
+    /** Closes the storage. */
+    close(): Promise<void>;
+}
diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts
new file mode 100644
index 00000000..d2486942
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-testing-client.ts
@@ -0,0 +1,93 @@
+import { v4 as uuid } from "uuid";
+import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai";
+import { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
+import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
+
+import { MCP_SERVER_CLI_SCRIPT } from "./constants.js";
+import { LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
+
+type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise<CallToolResult>;
+export type MockedTools = Record<string, ToolResultGeneratorFn>; // tool name -> mocked implementation
+
+/**
+ * AccuracyTestingClient is a bridge between actual MCP client connected to our
+ * MCP server and our Tool calling agent. It serves the following purposes:
+ * 1. Captures actual tools provided by our MCP server
+ * 2. Translates captured MCP tools to tool definitions that can be consumed by
+ *    Tool Calling agent (Ref: `vercelTools`)
+ * 3. Allow dynamic mocking and resetting of mocks of individual tool calls.
+ * 4. Records and provides tool calls made by LLMs with their parameters.
+ */
+export class AccuracyTestingClient {
+    private mockedTools: MockedTools = {};
+    private llmToolCalls: LLMToolCall[] = [];
+
+    private constructor(private readonly vercelMCPClient: Awaited<ReturnType<typeof createMCPClient>>) {}
+    /** Closes the wrapped MCP client. */
+    async close() {
+        await this.vercelMCPClient?.close();
+    }
+    /** Returns the MCP client's tools, re-wrapped so that every call is recorded and mocks take precedence. */
+    async vercelTools() {
+        const vercelTools = (await this.vercelMCPClient?.tools()) ?? {};
+        const rewrappedVercelTools: typeof vercelTools = {};
+        for (const [toolName, tool] of Object.entries(vercelTools)) {
+            rewrappedVercelTools[toolName] = createVercelTool({
+                ...tool,
+                execute: async (args, options) => {
+                    this.llmToolCalls.push({
+                        toolCallId: uuid(),
+                        toolName: toolName,
+                        parameters: args as Record<string, unknown>,
+                    });
+                    try {
+                        const toolResultGeneratorFn = this.mockedTools[toolName];
+                        if (toolResultGeneratorFn) {
+                            return await toolResultGeneratorFn(args);
+                        }
+
+                        return await tool.execute(args, options);
+                    } catch (error) {
+                        // There are cases when LLM calls the tools incorrectly
+                        // and the schema definition check fails. Production tool
+                        // calling agents hand such failures back so that the LLM
+                        // can course correct; we do the same. JSON.stringify drops
+                        // an Error's message, so Errors are rendered with String().
+                        return {
+                            isError: true,
+                            content: error instanceof Error ? String(error) : JSON.stringify(error),
+                        };
+                    }
+                },
+            });
+        }
+
+        return rewrappedVercelTools;
+    }
+    /** Returns the tool calls recorded since the last reset (recorded even when the call itself failed). */
+    getLLMToolCalls() {
+        return this.llmToolCalls;
+    }
+    /** Replaces the current set of mocked tools with the given one. */
+    mockTools(mockedTools: MockedTools) {
+        this.mockedTools = mockedTools;
+    }
+    /** Drops all mocks and starts a fresh list of recorded tool calls. */
+    resetForTests() {
+        this.mockTools({});
+        this.llmToolCalls = [];
+    }
+    /** Starts the MCP server CLI over stdio with the given connection string and wraps the connected client. */
+    static async initializeClient(mdbConnectionString: string) {
+        const clientTransport = new StdioClientTransport({
+            command: process.execPath,
+            args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString],
+        });
+
+        const client = await createMCPClient({
+            transport: clientTransport,
+        });
+
+        return new AccuracyTestingClient(client);
+    }
+}
diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts
new file mode 100644
index 00000000..ee0b5f7f
--- /dev/null
+++ b/tests/accuracy/sdk/agent.ts
@@ -0,0 +1,56 @@
+import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai";
+import { Model } from "./models.js";
+
+// Default system prompt; the entries are joined with newlines when the agent prompts a model.
+const systemPrompt = [
+    'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119',
+    "You are an expert AI assistant with access to a set of tools for MongoDB database operations.",
+    "You MUST use the most relevant tool to answer the user's request",
+    "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments",
+    "If a task requires multiple tool calls, you MUST call all the necessary tools in sequence, following the requirements mentioned above for each tool called.",
+    'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"',
+];
+
+// These types are not exported by the Vercel SDK, so we derive them here for re-use.
+export type VercelMCPClient = Awaited<ReturnType<typeof experimental_createMCPClient>>;
+export type VercelMCPClientTools = Awaited<ReturnType<VercelMCPClient["tools"]>>;
+export type VercelAgent = ReturnType<typeof getVercelToolCallingAgent>;
+
+export interface VercelAgentPromptResult {
+    respondingModel: string; // model id taken from the generateText response
+    tokensUsage?: {
+        promptTokens?: number;
+        completionTokens?: number;
+        totalTokens?: number;
+    };
+    text: string; // text of the generateText result
+    messages: Record<string, unknown>[]; // messages of the generateText response
+}
+
+// Generic interface for Agent, in case we need to switch to some other agent
+// development SDK. `prompt` takes the prompt text, the model and the tools.
+export interface Agent<Model = unknown, Tools = unknown, Result = unknown> {
+    prompt(prompt: string, model: Model, tools: Tools): Promise<Result>;
+}
+
+export function getVercelToolCallingAgent(
+    requestedSystemPrompt?: string
+): Agent<Model<LanguageModelV1>, VercelMCPClientTools, VercelAgentPromptResult> {
+    // The default system prompt lines come first, then the requested one when
+    // it is given (falsy entries are dropped); everything is joined by newlines.
+    const system = [...systemPrompt, requestedSystemPrompt].filter(Boolean).join("\n");
+
+    return {
+        async prompt(prompt: string, model: Model<LanguageModelV1>, tools: VercelMCPClientTools) {
+            const { text, usage, response } = await generateText({
+                model: model.getModel(),
+                system,
+                prompt,
+                tools,
+                maxSteps: 100,
+            });
+            // Reshape the generateText result into a VercelAgentPromptResult.
+            return { text, messages: response.messages, respondingModel: response.modelId, tokensUsage: usage };
+        },
+    };
+}
diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts
new file mode 100644
index 00000000..0598b1a7
--- /dev/null
+++ b/tests/accuracy/sdk/constants.ts
@@ -0,0 +1,22 @@
+import path from "path";
+import { fileURLToPath } from "url";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+// fileURLToPath() yields this file's path; its dirname is tests/accuracy/sdk, three levels below the root.
+export const ROOT_DIR = path.join(__dirname, "..", "..", "..");
+
+export const DIST_DIR = path.join(ROOT_DIR, "dist");
+
+export const RESOURCES_DIR = path.join(ROOT_DIR, "resources");
+
+export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js");
+// The data dumps are siblings of the sdk directory: tests/accuracy/test-data-dumps.
+export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "..", "test-data-dumps");
+
+export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy");
+
+export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json");
+
+export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "tests-summary.html");
+
+export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html");
diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts
new file mode 100644
index 00000000..2a358ce1
--- /dev/null
+++ b/tests/accuracy/sdk/describe-accuracy-tests.ts
@@ -0,0 +1,119 @@
+import { TestableModels } from "./models.js";
+import { calculateToolCallingAccuracy } from "./accuracy-scorer.js";
+import { getVercelToolCallingAgent, VercelAgent } from "./agent.js";
+import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js";
+import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js";
+import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js";
+import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
+import { getCommitSHA } from "./git-info.js";
+
+export interface AccuracyTestConfig {
+    /** The prompt to be provided to LLM for evaluation. */
+    prompt: string;
+
+    /**
+     * A list of tools and their parameters that we expect LLM to call based on
+     * how vague or detailed the prompt is. Ideally this should be a list of
+     * bare minimum and critical tool calls that are required to solve the
+     * problem mentioned in the prompt but because, for even a slightly vague
+     * prompt, LLM might decide to do additional confirmation by calling other
+     * tools, it's fine to include those other tool calls as well to get a
+     * perfect 1 on the tool calling accuracy score. */
+    expectedToolCalls: ExpectedToolCall[];
+
+    /**
+     * The additional system prompt to be appended to the already injected
+     * system prompt. */
+    systemPrompt?: string;
+
+    /**
+     * A small hint appended to the actual prompt in test, which is supposed to
+     * hint LLM to assume that the MCP server is already connected so that it
+     * does not call the connect tool.
+     * The hint is injected unless this is explicitly set to false. */
+    injectConnectedAssumption?: boolean;
+
+    /**
+     * A map of tool names to their mocked implementation. When the mocked
+     * implementations are available, the testing client will prefer those over
+     * actual MCP tool calls. */
+    mockedTools?: MockedTools;
+}
+
+/**
+ * Registers one describe block per model with one test per accuracy test
+ * config. Each test prompts the model through the tool calling agent, scores
+ * the recorded tool calls and stores the outcome as a snapshot entry.
+ */
+export function describeAccuracyTests(models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[]) {
+    if (!process.env.MDB_ACCURACY_RUN_ID) {
+        throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!");
+    }
+
+    if (!models.length) {
+        throw new Error("No models available to test. Ensure that the API keys are properly setup!");
+    }
+
+    describe.each(models)(`$displayName`, function (model) {
+        const accuracyRunId = `${process.env.MDB_ACCURACY_RUN_ID}`;
+        const mdbIntegration = setupMongoDBIntegrationTest();
+        const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration);
+
+        let commitSHA: string;
+        let accuracySnapshotStorage: AccuracySnapshotStorage;
+        let testMCPClient: AccuracyTestingClient;
+
+        beforeAll(async () => {
+            const retrievedCommitSHA = await getCommitSHA();
+            if (!retrievedCommitSHA) {
+                throw new Error("Could not derive commitSHA, exiting accuracy tests!");
+            }
+            commitSHA = retrievedCommitSHA;
+
+            accuracySnapshotStorage = await getAccuracySnapshotStorage();
+            testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString());
+        });
+
+        beforeEach(async () => {
+            await cleanupTestDatabases(mdbIntegration);
+            await populateTestData();
+            testMCPClient.resetForTests();
+        });
+
+        afterAll(async () => {
+            await accuracySnapshotStorage?.close();
+            await testMCPClient?.close();
+        });
+
+        it.each(accuracyTestConfigs)("$prompt", async function (testConfig) {
+            // The agent is created per test so that the system prompt configured
+            // for the test (if any) is appended to the default system prompt.
+            const agent: VercelAgent = getVercelToolCallingAgent(testConfig.systemPrompt);
+            testMCPClient.mockTools(testConfig.mockedTools ?? {});
+            const toolsForModel = await testMCPClient.vercelTools();
+            const promptForModel =
+                testConfig.injectConnectedAssumption === false
+                    ? testConfig.prompt
+                    : [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ");
+
+            const timeBeforePrompt = Date.now();
+            const result = await agent.prompt(promptForModel, model, toolsForModel);
+            const timeAfterPrompt = Date.now();
+
+            const llmToolCalls = testMCPClient.getLLMToolCalls();
+            const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls);
+            await accuracySnapshotStorage.createSnapshotEntry({
+                accuracyRunId,
+                commitSHA,
+                provider: model.provider,
+                requestedModel: model.modelName,
+                prompt: testConfig.prompt,
+                llmResponseTime: timeAfterPrompt - timeBeforePrompt,
+                toolCallingAccuracy,
+                actualToolCalls: llmToolCalls,
+                expectedToolCalls: testConfig.expectedToolCalls,
+                ...result,
+            });
+        });
+    });
+}
diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/git-info.ts
new file mode 100644
index 00000000..a0918a6f
--- /dev/null
+++ b/tests/accuracy/sdk/git-info.ts
@@ -0,0 +1,12 @@
+import { simpleGit } from "simple-git";
+
+export async function getCommitSHA(): Promise<string | undefined> {
+    // Only the newest commit is needed, so do not load the whole history.
+    const { latest } = await simpleGit().log({ maxCount: 1 });
+    return latest?.hash; // undefined when the log holds no commit
+}
+
+export async function getMergeBase(targetBranch: string, workBranchOrCommit: string): Promise<string> {
+    const mergeBaseArgs = ["merge-base", targetBranch, workBranchOrCommit]; // git merge-base <target> <work>
+    return (await simpleGit().raw(mergeBaseArgs)).trim(); // strip surrounding whitespace from the raw output
+}
diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts
new file mode 100644
index 00000000..9f47028f
--- /dev/null
+++ b/tests/accuracy/sdk/models.ts
@@ -0,0 +1,97 @@
+import { LanguageModelV1 } from "ai";
+import { createGoogleGenerativeAI } from "@himanshusinghs/google";
+import { createAzure } from "@ai-sdk/azure";
+import { createOpenAI } from "@ai-sdk/openai";
+import { ollama } from "ollama-ai-provider";
+
+export interface Model<P extends LanguageModelV1 = LanguageModelV1> {
+    readonly modelName: string; // model identifier handed to the provider
+    readonly provider: string; // name of the provider serving the model
+    readonly displayName: string; // label, e.g. "Azure - gpt-4o"
+    isAvailable(): boolean; // whether the model can be used in the current environment
+    getModel(): P; // the language model instance for modelName
+}
+
+/** A model served by OpenAI, configured through MDB_OPEN_AI_API_KEY. */
+export class OpenAIModel implements Model {
+    readonly provider = "OpenAI";
+    readonly displayName: string;
+
+    constructor(readonly modelName: string) {
+        this.displayName = [this.provider, modelName].join(" - ");
+    }
+
+    isAvailable(): boolean {
+        return Boolean(process.env.MDB_OPEN_AI_API_KEY); // unset or empty key => unavailable
+    }
+
+    getModel() {
+        const openAI = createOpenAI({ apiKey: process.env.MDB_OPEN_AI_API_KEY });
+        return openAI(this.modelName);
+    }
+}
+
+/** A model served by Azure OpenAI, configured through the MDB_AZURE_OPEN_AI_* env variables. */
+export class AzureOpenAIModel implements Model {
+    readonly provider = "Azure";
+    readonly displayName: string;
+
+    constructor(readonly modelName: string) {
+        this.displayName = [this.provider, modelName].join(" - ");
+    }
+
+    /** Both the API key and the API URL have to be set (and non-empty). */
+    isAvailable(): boolean {
+        return Boolean(process.env.MDB_AZURE_OPEN_AI_API_KEY && process.env.MDB_AZURE_OPEN_AI_API_URL);
+    }
+
+    getModel() {
+        // The API version is fixed here; URL and key are read from the environment.
+        const { MDB_AZURE_OPEN_AI_API_URL: baseURL, MDB_AZURE_OPEN_AI_API_KEY: apiKey } = process.env;
+        return createAzure({ baseURL, apiKey, apiVersion: "2024-12-01-preview" })(this.modelName);
+    }
+}
+
+/** A model served by Google (Gemini), configured through MDB_GEMINI_API_KEY. */
+export class GeminiModel implements Model {
+    readonly provider = "Google";
+    readonly displayName: string;
+
+    constructor(readonly modelName: string) {
+        this.displayName = [this.provider, modelName].join(" - ");
+    }
+
+    isAvailable(): boolean {
+        return Boolean(process.env.MDB_GEMINI_API_KEY); // unset or empty key => unavailable
+    }
+
+    getModel() {
+        const google = createGoogleGenerativeAI({ apiKey: process.env.MDB_GEMINI_API_KEY });
+        return google(this.modelName);
+    }
+}
+
+export class OllamaModel implements Model {
+    readonly provider = "Ollama";
+    readonly displayName: string;
+
+    constructor(readonly modelName: string) {
+        this.displayName = [this.provider, modelName].join(" - ");
+    }
+
+    isAvailable(): boolean {
+        return true; // no environment variable is consulted for Ollama
+    }
+
+    getModel() {
+        return ollama(this.modelName); // uses the provider instance exported by ollama-ai-provider
+    }
+}
+
+const ALL_TESTABLE_MODELS = [new AzureOpenAIModel("gpt-4o")];
+// Derived from getAvailableModels so the type follows the entries of ALL_TESTABLE_MODELS.
+export type TestableModels = ReturnType<typeof getAvailableModels>;
+/** Returns the entries of ALL_TESTABLE_MODELS whose isAvailable() check passes. */
+export function getAvailableModels() {
+    return ALL_TESTABLE_MODELS.filter((model) => model.isAvailable());
+}
diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json
new file mode 100644
index 00000000..f605f031
--- /dev/null
+++ b/tests/accuracy/test-data-dumps/comics.books.json
@@ -0,0 +1,417 @@
+[
+  {
+    "_id": "fa53ead3-36f3-414c-9b3a-53aa9cf5038a",
+    "title": "Configurable dedicated project",
+    "publisher": "Dark Horse Comics",
+    "release_date": "2007-03-02T00:00:00",
+    "issues": 118,
+    "main_characters": ["Stephen Shaw"],
+    "genre": ["Sci-Fi"]
+  },
+  {
+    "_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948",
+    "title": "Focused intangible service-desk",
+    "publisher": "Image Comics",
+    "release_date": "1998-12-07T00:00:00",
+    "issues": 137,
+    "main_characters": ["Margaret Hogan"],
+    "genre": ["Adventure", "Horror"]
+  },
+  {
+    "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d",
+    "title": "Expanded secondary system engine",
+    "publisher": "DC Comics",
+    "release_date": "2012-12-01T00:00:00",
+    "issues": 227,
+    "main_characters": ["Joseph Cook", "Tammy Bishop"],
+    "genre": ["Superhero"]
+  },
+  {
+    "_id": "bb72b493-2a61-41d7-9406-dfaf6e51a425",
+    "title": "Customizable zero-defect Graphic Interface",
+    "publisher": "DC Comics",
+    "release_date": "2011-02-24T00:00:00",
+    "issues": 270,
+    "main_characters": ["Sandra Moss"],
+    "genre": ["Fantasy"]
+  },
+  {
+    "_id": "ea85131f-dfc8-4997-b3b0-996138185d73",
+    "title": "Reduced eco-centric help-desk",
+    "publisher": "Dark Horse Comics",
+    "release_date": "2021-03-12T00:00:00",
+    "issues": 202,
+    "main_characters": [
+      "Margaret Hogan",
+      "Angelica Stein",
+      "Tammy Murphy",
+      "Larry Hensley"
+    ],
+    "genre": ["Adventure", "Horror"]
+  },
+  {
+    "_id": "fdd56270-eb31-4456-8bf4-df81371eb290",
+    "title": "Triple-buffered dedicated help-desk",
+    "publisher": "Image Comics",
+    "release_date": "1964-09-20T00:00:00",
+    "issues": 36,
+    "main_characters": [
+      "Richard Cooper",
+      "James Sanchez",
+      "Micheal Brown",
+      "Jeremy Rice"
+    ],
+    "genre": ["Fantasy", "Action"]
+  },
+  {
+    "_id": "6de66ba4-3975-4055-824c-cda5caf517d2",
+    "title": "Operative logistical secured line",
+    "publisher": "Marvel Comics",
+    "release_date": "2007-11-19T00:00:00",
+    "issues": 55,
+    "main_characters": ["Joseph Bowman", "Robert Logan", "Ashley Watkins"],
+    "genre": ["Sci-Fi", "Horror"]
+  },
+  {
+    "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7",
+    "title": "Multi-lateral multi-state framework",
+    "publisher": "IDW Publishing",
+    "release_date": "2011-09-14T00:00:00",
+    "issues": 250,
+    "main_characters": [
+      "Ashley Watkins",
+      "Virginia Watts",
+      "Lindsay Anderson",
+      "Scott Garcia"
+    ],
+    "genre": ["Action", "Horror"]
+  },
+  {
+    "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001",
+    "title": "Re-engineered encompassing standardization",
+    "publisher": "Marvel Comics",
+    "release_date": "1987-04-16T00:00:00",
+    "issues": 235,
+    "main_characters": ["Julie Goodwin"],
+    "genre": ["Sci-Fi"]
+  },
+  {
+    "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee",
+    "title": "Fully-configurable local success",
+    "publisher": "Dark Horse Comics",
+    "release_date": "1979-09-13T00:00:00",
+    "issues": 239,
+    "main_characters": ["Chad Pham", "Lindsay Anderson", "Carlos Burton"],
+    "genre": ["Adventure"]
+  },
+  {
+    "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9",
+    "title": "Realigned zero-defect capability",
+    "publisher": "Marvel Comics",
+    "release_date": "2023-10-01T00:00:00",
+    "issues": 163,
+    "main_characters": ["Kevin Humphrey", "Maria Wright", "Virginia Watts"],
+    "genre": ["Fantasy", "Action"]
+  },
+  {
+    "_id": "fb986790-df22-4db4-8168-c76e9e9471f8",
+    "title": "Sharable bottom-line frame",
+    "publisher": "IDW Publishing",
+    "release_date": "2016-09-28T00:00:00",
+    "issues": 14,
+    "main_characters": ["Brian Vincent"],
+    "genre": ["Sci-Fi", "Fantasy"]
+  },
+  {
+    "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0",
+    "title": "Centralized next generation middleware",
+    "publisher": "Image Comics",
+    "release_date": "1970-04-16T00:00:00",
+    "issues": 5,
+    "main_characters": ["Joseph Cook"],
+    "genre": ["Fantasy"]
+  },
+  {
+    "_id": "7959187e-9693-43a1-ae2d-c168431fceb2",
+    "title": "Re-engineered heuristic array",
+    "publisher": "IDW Publishing",
+    "release_date": "2019-02-15T00:00:00",
+    "issues": 121,
+    "main_characters": ["Angelica Stein", "Benjamin Morris", "Jeremy Rice"],
+    "genre": ["Fantasy", "Action"]
+  },
+  {
+    "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c",
+    "title": "Programmable transitional collaboration",
+    "publisher": "DC Comics",
+    "release_date": "1999-08-10T00:00:00",
+    "issues": 235,
+    "main_characters": [
+      "Joseph Cook",
+      "Cynthia Brown",
+      "Carlos Burton",
+      "Micheal Brown"
+    ],
+    "genre": ["Adventure"]
+  },
+  {
+    "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836",
+    "title": "Object-based dynamic knowledgebase",
+    "publisher": "Image Comics",
+    "release_date": "1993-02-24T00:00:00",
+    "issues": 189,
+    "main_characters": [
+      "Cristian Oneal",
+      "Brian Vincent",
+      "Holly Green",
+      "James Sanchez"
+    ],
+    "genre": ["Sci-Fi", "Fantasy"]
+  },
+  {
+    "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e",
+    "title": "Enhanced asynchronous matrices",
+    "publisher": "DC Comics",
+    "release_date": "2001-03-01T00:00:00",
+    "issues": 176,
+    "main_characters": ["Justin Martinez", "Tammy Murphy"],
+    "genre": ["Action", "Fantasy"]
+  },
+  {
+    "_id": "c0fe2869-eb7d-4f09-a773-028387a54969",
+    "title": "Synergized maximized artificial intelligence",
+    "publisher": "DC Comics",
+    "release_date": "1976-09-05T00:00:00",
+    "issues": 68,
+    "main_characters": ["Christopher Elliott", "Maria Wright"],
+    "genre": ["Superhero", "Adventure"]
+  },
+  {
+    "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467",
+    "title": "Switchable bottom-line complexity",
+    "publisher": "Marvel Comics",
+    "release_date": "2012-08-12T00:00:00",
+    "issues": 156,
+    "main_characters": [
+      "Lindsay Anderson",
+      "Virginia Watts",
+      "Robert Logan",
+      "Margaret Hogan"
+    ],
+    "genre": ["Adventure"]
+  },
+  {
+    "_id": "f72be3a7-d4be-40a1-ad66-370b44759047",
+    "title": "Triple-buffered impactful customer loyalty",
+    "publisher": "Marvel Comics",
+    "release_date": "1976-09-18T00:00:00",
+    "issues": 275,
+    "main_characters": ["Sandra Moss", "Charles Blair", "Justin Martinez"],
+    "genre": ["Fantasy", "Action"]
+  },
+  {
+    "_id": "da5be16e-13e8-42d5-8954-bd89919395af",
+    "title": "Programmable 24/7 website",
+    "publisher": "DC Comics",
+    "release_date": "2023-11-06T00:00:00",
+    "issues": 278,
+    "main_characters": [
+      "Luis Callahan",
+      "Carlos Burton",
+      "Cristian Oneal",
+      "Michelle Valdez"
+    ],
+    "genre": ["Horror", "Fantasy"]
+  },
+  {
+    "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec",
+    "title": "Advanced incremental framework",
+    "publisher": "Image Comics",
+    "release_date": "2008-07-21T00:00:00",
+    "issues": 109,
+    "main_characters": ["Holly Green", "Diana Mata", "Julie Goodwin"],
+    "genre": ["Horror", "Sci-Fi"]
+  },
+  {
+    "_id": "fec61fdd-bddb-431a-b14a-d81601a47cf8",
+    "title": "Front-line coherent system engine",
+    "publisher": "DC Comics",
+    "release_date": "2012-04-27T00:00:00",
+    "issues": 297,
+    "main_characters": ["Joshua Hicks"],
+    "genre": ["Action", "Horror"]
+  },
+  {
+    "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c",
+    "title": "Progressive systematic superstructure",
+    "publisher": "Image Comics",
+    "release_date": "1996-02-20T00:00:00",
+    "issues": 295,
+    "main_characters": ["Margaret Hogan", "Christopher Elliott", "Joseph Cook"],
+    "genre": ["Fantasy", "Adventure"]
+  },
+  {
+    "_id": "338a83ad-06fc-42e1-a605-60a192ce5643",
+    "title": "Implemented national help-desk",
+    "publisher": "DC Comics",
+    "release_date": "2015-05-11T00:00:00",
+    "issues": 257,
+    "main_characters": [
+      "Lindsay Anderson",
+      "James Sanchez",
+      "Julie Goodwin",
+      "Charles Blair"
+    ],
+    "genre": ["Action"]
+  },
+  {
+    "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb",
+    "title": "Down-sized impactful workforce",
+    "publisher": "IDW Publishing",
+    "release_date": "2024-06-19T00:00:00",
+    "issues": 259,
+    "main_characters": ["Debbie Green"],
+    "genre": ["Sci-Fi", "Superhero"]
+  },
+  {
+    "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c",
+    "title": "Re-engineered leadingedge structure",
+    "publisher": "DC Comics",
+    "release_date": "2011-04-14T00:00:00",
+    "issues": 282,
+    "main_characters": [
+      "Larry Hensley",
+      "Joseph Cook",
+      "Brian Vincent",
+      "Sandra Moss"
+    ],
+    "genre": ["Adventure"]
+  },
+  {
+    "_id": "71b845f3-4416-430a-81eb-8c208f824365",
+    "title": "Cloned 3rdgeneration contingency",
+    "publisher": "Dark Horse Comics",
+    "release_date": "2002-07-11T00:00:00",
+    "issues": 238,
+    "main_characters": [
+      "Larry Hensley",
+      "Margaret Hogan",
+      "Holly Green",
+      "Joseph Bowman"
+    ],
+    "genre": ["Superhero", "Fantasy"]
+  },
+  {
+    "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112",
+    "title": "Secured zero tolerance monitoring",
+    "publisher": "DC Comics",
+    "release_date": "1969-11-30T00:00:00",
+    "issues": 104,
+    "main_characters": ["Micheal Brown"],
+    "genre": ["Horror", "Superhero"]
+  },
+  {
+    "_id": "091e16d8-d50c-4e7d-9b3a-545cf2596738",
+    "title": "Automated bifurcated access",
+    "publisher": "Image Comics",
+    "release_date": "1990-01-24T00:00:00",
+    "issues": 74,
+    "main_characters": ["Robert Logan"],
+    "genre": ["Sci-Fi"]
+  },
+  {
+    "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53",
+    "title": "Universal high-level pricing structure",
+    "publisher": "DC Comics",
+    "release_date": "1971-04-21T00:00:00",
+    "issues": 135,
+    "main_characters": ["Jeremy Rice", "Elizabeth Robinson", "James Sanchez"],
+    "genre": ["Action", "Sci-Fi"]
+  },
+  {
+    "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6",
+    "title": "Reduced optimizing strategy",
+    "publisher": "Dark Horse Comics",
+    "release_date": "1984-06-24T00:00:00",
+    "issues": 111,
+    "main_characters": ["Joshua Hicks", "Jeremy Rice", "Micheal Brown"],
+    "genre": ["Fantasy", "Superhero"]
+  },
+  {
+    "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9",
+    "title": "Virtual non-volatile groupware",
+    "publisher": "DC Comics",
+    "release_date": "2013-05-22T00:00:00",
+    "issues": 13,
+    "main_characters": ["Luis Callahan", "Tammy Bishop", "Cynthia Brown"],
+    "genre": ["Action"]
+  },
+  {
+    "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b",
+    "title": "Horizontal disintermediate extranet",
+    "publisher": "DC Comics",
+    "release_date": "2021-12-03T00:00:00",
+    "issues": 129,
+    "main_characters": ["Margaret Hogan"],
+    "genre": ["Action"]
+  },
+  {
+    "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8",
+    "title": "Cross-platform discrete framework",
+    "publisher": "Dark Horse Comics",
+    "release_date": "2001-08-02T00:00:00",
+    "issues": 38,
+    "main_characters": ["James Sanchez", "Larry Hensley"],
+    "genre": ["Superhero"]
+  },
+  {
+    "_id": "05d637ed-3942-4276-a885-7b3363dd48e2",
+    "title": "Cross-platform regional info-mediaries",
+    "publisher": "Image Comics",
+    "release_date": "2005-03-30T00:00:00",
+    "issues": 150,
+    "main_characters": ["Carlos Burton"],
+    "genre": ["Superhero", "Fantasy"]
+  },
+  {
+    "_id": "88904f06-50a6-44f1-bccc-f379a9788611",
+    "title": "Mandatory 6thgeneration secured line",
+    "publisher": "Image Comics",
+    "release_date": "2021-06-27T00:00:00",
+    "issues": 262,
+    "main_characters": ["Luis Callahan"],
+    "genre": ["Sci-Fi", "Superhero"]
+  },
+  {
+    "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c",
+    "title": "Exclusive interactive concept",
+    "publisher": "IDW Publishing",
+    "release_date": "1969-06-03T00:00:00",
+    "issues": 264,
+    "main_characters": ["Scott Garcia", "Joseph Bowman"],
+    "genre": ["Fantasy", "Superhero"]
+  },
+  {
+    "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c",
+    "title": "Focused intermediate methodology",
+    "publisher": "DC Comics",
+    "release_date": "2004-03-19T00:00:00",
+    "issues": 210,
+    "main_characters": [
+      "Justin Martinez",
+      "Julie Goodwin",
+      "Benjamin Morris",
+      "Virginia Watts"
+    ],
+    "genre": ["Adventure", "Action"]
+  },
+  {
+    "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250",
+    "title": "Right-sized contextually-based toolset",
+    "publisher": "IDW Publishing",
+    "release_date": "2007-12-27T00:00:00",
+    "issues": 117,
+    "main_characters": ["Debbie Green", "Christopher Elliott", "Joshua Hicks"],
+    "genre": ["Sci-Fi", "Action"]
+  }
+]
diff --git a/tests/accuracy/test-data-dumps/comics.characters.json b/tests/accuracy/test-data-dumps/comics.characters.json
new file mode 100644
index 00000000..4a255f48
--- /dev/null
+++ b/tests/accuracy/test-data-dumps/comics.characters.json
@@ -0,0 +1,402 @@
+[
+  {
+    "_id": "d7047787-abea-40fa-b78e-939925fd3589",
+    "name": "Elizabeth Robinson",
+    "alias": "ashley62",
+    "powers": ["Shapeshifting", "Telepathy", "Flight"],
+    "first_appearance": "1961-06-23T00:00:00",
+    "affiliations": ["Fantastic Four", "X-Men"],
+    "origin": "Earth",
+    "is_villain": false
+  },
+  {
+    "_id": "06ac8173-51a6-404c-8f9a-628de889b1de",
+    "name": "Joshua Wang",
+    "alias": "paulasmith",
+    "powers": ["Telekinesis"],
+    "first_appearance": "1987-04-16T00:00:00",
+    "affiliations": ["Fantastic Four", "Justice League"],
+    "origin": "Earth",
+    "is_villain": true
+  },
+  {
+    "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959",
+    "name": "Stephen Shaw",
+    "alias": "adamskenneth",
+    "powers": ["Super Speed", "Flight"],
+    "first_appearance": "2004-07-26T00:00:00",
+    "affiliations": [],
+    "origin": "Atlantis",
+    "is_villain": true
+  },
+  {
+    "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197",
+    "name": "Joseph Bowman",
+    "alias": "amysalazar",
+    "powers": ["Time Manipulation"],
+    "first_appearance": "1961-07-03T00:00:00",
+    "affiliations": ["Teen Titans", "Avengers"],
+    "origin": "Atlantis",
+    "is_villain": true
+  },
+  {
+    "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e",
+    "name": "Debbie Green",
+    "alias": "steventodd",
+    "powers": ["Energy Blasts", "Regeneration"],
+    "first_appearance": "2021-12-05T00:00:00",
+    "affiliations": [],
+    "origin": "Asgard",
+    "is_villain": false
+  },
+  {
+    "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f",
+    "name": "Christopher Elliott",
+    "alias": "barajasmitchell",
+    "powers": ["Flight", "Invisibility", "Telekinesis"],
+    "first_appearance": "1947-03-23T00:00:00",
+    "affiliations": [],
+    "origin": "Earth",
+    "is_villain": false
+  },
+  {
+    "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220",
+    "name": "Tammy Murphy",
+    "alias": "jessicagill",
+    "powers": ["Super Strength", "Telekinesis"],
+    "first_appearance": "2000-07-06T00:00:00",
+    "affiliations": [],
+    "origin": "Mutant",
+    "is_villain": false
+  },
+  {
+    "_id": "817c0b11-3eac-4a3a-b55f-203126db060f",
+    "name": "Scott Garcia",
+    "alias": "whitechristie",
+    "powers": ["Telepathy", "Energy Blasts"],
+    "first_appearance": "2000-11-22T00:00:00",
+    "affiliations": [],
+    "origin": "Asgard",
+    "is_villain": false
+  },
+  {
+    "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a",
+    "name": "Julie Goodwin",
+    "alias": "robertsmith",
+    "powers": ["Telepathy", "Super Speed"],
+    "first_appearance": "1953-08-09T00:00:00",
+    "affiliations": ["Teen Titans"],
+    "origin": "Mutant",
+    "is_villain": true
+  },
+  {
+    "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a",
+    "name": "Joshua Hicks",
+    "alias": "cynthia32",
+    "powers": ["Super Strength", "Invisibility", "Telekinesis"],
+    "first_appearance": "1967-07-17T00:00:00",
+    "affiliations": [],
+    "origin": "Krypton",
+    "is_villain": false
+  },
+  {
+    "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e",
+    "name": "Justin Martinez",
+    "alias": "janicebrown",
+    "powers": ["Super Speed", "Super Strength"],
+    "first_appearance": "1973-09-19T00:00:00",
+    "affiliations": ["Avengers"],
+    "origin": "Mutant",
+    "is_villain": true
+  },
+  {
+    "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14",
+    "name": "Holly Green",
+    "alias": "ystanley",
+    "powers": ["Shapeshifting", "Energy Blasts"],
+    "first_appearance": "2013-08-05T00:00:00",
+    "affiliations": [],
+    "origin": "Krypton",
+    "is_villain": true
+  },
+  {
+    "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466",
+    "name": "Margaret Hogan",
+    "alias": "wendyconway",
+    "powers": ["Super Speed", "Telepathy"],
+    "first_appearance": "1944-08-13T00:00:00",
+    "affiliations": ["Justice League", "X-Men"],
+    "origin": "Earth",
+    "is_villain": false
+  },
+  {
+    "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc",
+    "name": "Ashley Watkins",
+    "alias": "cjohnson",
+    "powers": ["Shapeshifting"],
+    "first_appearance": "1940-09-13T00:00:00",
+    "affiliations": ["Fantastic Four", "Guardians of the Galaxy"],
+    "origin": "Mutant",
+    "is_villain": true
+  },
+  {
+    "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff",
+    "name": "Tammy Bishop",
+    "alias": "geoffreyryan",
+    "powers": ["Regeneration"],
+    "first_appearance": "1984-11-04T00:00:00",
+    "affiliations": ["Fantastic Four", "X-Men"],
+    "origin": "Earth",
+    "is_villain": true
+  },
+  {
+    "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f",
+    "name": "Michelle Valdez",
+    "alias": "manuelcobb",
+    "powers": ["Regeneration", "Energy Blasts"],
+    "first_appearance": "2014-08-04T00:00:00",
+    "affiliations": ["Teen Titans"],
+    "origin": "Mutant",
+    "is_villain": false
+  },
+  {
+    "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7",
+    "name": "Joseph Cook",
+    "alias": "scott40",
+    "powers": ["Telepathy", "Telekinesis"],
+    "first_appearance": "1976-04-01T00:00:00",
+    "affiliations": [],
+    "origin": "Earth",
+    "is_villain": true
+  },
+  {
+    "_id": "0738b98f-4699-4609-9156-fb6a1085a503",
+    "name": "Jeremy Rice",
+    "alias": "james82",
+    "powers": ["Invisibility"],
+    "first_appearance": "1977-09-22T00:00:00",
+    "affiliations": [],
+    "origin": "Asgard",
+    "is_villain": false
+  },
+  {
+    "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a",
+    "name": "Chad Pham",
+    "alias": "smithjennifer",
+    "powers": ["Telepathy"],
+    "first_appearance": "2001-05-26T00:00:00",
+    "affiliations": ["Teen Titans"],
+    "origin": "Mars",
+    "is_villain": false
+  },
+  {
+    "_id": "d545ec48-680c-4493-8650-d759bedabb7e",
+    "name": "Diana Mata",
+    "alias": "zwilliamson",
+    "powers": ["Super Speed", "Energy Blasts", "Invisibility"],
+    "first_appearance": "2010-11-21T00:00:00",
+    "affiliations": [],
+    "origin": "Mars",
+    "is_villain": false
+  },
+  {
+    "_id": "e6bfb576-d65c-40f8-a547-90719578e03c",
+    "name": "Maria Wright",
+    "alias": "yraymond",
+    "powers": ["Flight", "Telepathy"],
+    "first_appearance": "1971-04-15T00:00:00",
+    "affiliations": ["Avengers", "Teen Titans"],
+    "origin": "Asgard",
+    "is_villain": true
+  },
+  {
+    "_id": "a2e7b056-0c79-4a2e-83ff-1774b6e186ea",
+    "name": "Carlos Burton",
+    "alias": "rperkins",
+    "powers": ["Super Speed", "Time Manipulation", "Telekinesis"],
+    "first_appearance": "1970-01-20T00:00:00",
+    "affiliations": ["Teen Titans"],
+    "origin": "Mutant",
+    "is_villain": true
+  },
+  {
+    "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c",
+    "name": "Lindsay Anderson",
+    "alias": "amycox",
+    "powers": ["Super Strength", "Telekinesis"],
+    "first_appearance": "1976-04-30T00:00:00",
+    "affiliations": [],
+    "origin": "Atlantis",
+    "is_villain": false
+  },
+  {
+    "_id": "cdc66356-a438-4989-b4d1-315609ec6d91",
+    "name": "Larry Hensley",
+    "alias": "ylester",
+    "powers": ["Super Strength", "Invisibility", "Shapeshifting"],
+    "first_appearance": "2019-01-21T00:00:00",
+    "affiliations": ["Guardians of the Galaxy", "Avengers"],
+    "origin": "Asgard",
+    "is_villain": false
+  },
+  {
+    "_id": "0952b684-f887-446f-afcb-71d2ace3fd32",
+    "name": "Sandra Moss",
+    "alias": "alexandra81",
+    "powers": ["Telekinesis", "Super Speed"],
+    "first_appearance": "1989-07-28T00:00:00",
+    "affiliations": [],
+    "origin": "Earth",
+    "is_villain": false
+  },
+  {
+    "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc",
+    "name": "Cynthia Brown",
+    "alias": "freed",
+    "powers": ["Super Strength", "Energy Blasts"],
+    "first_appearance": "2015-06-19T00:00:00",
+    "affiliations": ["Fantastic Four"],
+    "origin": "Mars",
+    "is_villain": false
+  },
+  {
+    "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6",
+    "name": "Brian Vincent",
+    "alias": "ghowell",
+    "powers": ["Invisibility", "Flight", "Super Speed"],
+    "first_appearance": "2012-05-12T00:00:00",
+    "affiliations": [],
+    "origin": "Asgard",
+    "is_villain": false
+  },
+  {
+    "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f",
+    "name": "Kevin Humphrey",
+    "alias": "mary44",
+    "powers": ["Super Strength", "Super Speed", "Telepathy"],
+    "first_appearance": "1993-05-10T00:00:00",
+    "affiliations": ["Justice League", "Teen Titans"],
+    "origin": "Mutant",
+    "is_villain": true
+  },
+  {
+    "_id": "c147036a-ab66-4023-a950-1fb81acf7dca",
+    "name": "Luis Callahan",
+    "alias": "ashleyreeves",
+    "powers": ["Telekinesis"],
+    "first_appearance": "1943-11-02T00:00:00",
+    "affiliations": ["X-Men"],
+    "origin": "Krypton",
+    "is_villain": false
+  },
+  {
+    "_id": "c42cec2b-156d-481e-993b-aa93637ae76e",
+    "name": "Micheal Brown",
+    "alias": "lisa85",
+    "powers": ["Telepathy", "Flight", "Time Manipulation"],
+    "first_appearance": "1983-11-04T00:00:00",
+    "affiliations": [],
+    "origin": "Krypton",
+    "is_villain": false
+  },
+  {
+    "_id": "5bd85192-926b-42f3-bc18-afd40a53753e",
+    "name": "James Sanchez",
+    "alias": "mary95",
+    "powers": ["Energy Blasts", "Telekinesis"],
+    "first_appearance": "1999-05-20T00:00:00",
+    "affiliations": ["Justice League"],
+    "origin": "Atlantis",
+    "is_villain": false
+  },
+  {
+    "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367",
+    "name": "Richard Cooper",
+    "alias": "james85",
+    "powers": ["Telekinesis", "Energy Blasts", "Super Speed"],
+    "first_appearance": "2021-11-27T00:00:00",
+    "affiliations": ["Justice League", "Fantastic Four"],
+    "origin": "Mars",
+    "is_villain": true
+  },
+  {
+    "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0",
+    "name": "Charles Blair",
+    "alias": "barbara60",
+    "powers": ["Super Strength"],
+    "first_appearance": "2012-05-03T00:00:00",
+    "affiliations": [],
+    "origin": "Krypton",
+    "is_villain": false
+  },
+  {
+    "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d",
+    "name": "Virginia Watts",
+    "alias": "klane",
+    "powers": ["Telekinesis"],
+    "first_appearance": "2016-04-27T00:00:00",
+    "affiliations": [],
+    "origin": "Earth",
+    "is_villain": false
+  },
+  {
+    "_id": "495f64a9-123e-46d4-9ddb-21692353a849",
+    "name": "Robert Logan",
+    "alias": "griffinsean",
+    "powers": ["Telepathy"],
+    "first_appearance": "2003-07-16T00:00:00",
+    "affiliations": [],
+    "origin": "Krypton",
+    "is_villain": false
+  },
+  {
+    "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9",
+    "name": "Cheyenne Powell",
+    "alias": "laurenolsen",
+    "powers": ["Time Manipulation", "Energy Blasts"],
+    "first_appearance": "1964-02-05T00:00:00",
+    "affiliations": [],
+    "origin": "Atlantis",
+    "is_villain": false
+  },
+  {
+    "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b",
+    "name": "Benjamin Morris",
+    "alias": "sierra18",
+    "powers": ["Telekinesis", "Regeneration", "Shapeshifting"],
+    "first_appearance": "1964-09-27T00:00:00",
+    "affiliations": ["X-Men", "Avengers"],
+    "origin": "Mars",
+    "is_villain": false
+  },
+  {
+    "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e",
+    "name": "Cristian Oneal",
+    "alias": "harrellamy",
+    "powers": ["Super Speed"],
+    "first_appearance": "1965-01-29T00:00:00",
+    "affiliations": [],
+    "origin": "Mutant",
+    "is_villain": false
+  },
+  {
+    "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36",
+    "name": "Jessica Vargas",
+    "alias": "chadherrera",
+    "powers": ["Energy Blasts", "Super Strength", "Telekinesis"],
+    "first_appearance": "1974-03-29T00:00:00",
+    "affiliations": ["X-Men", "Teen Titans"],
+    "origin": "Earth",
+    "is_villain": true
+  },
+  {
+    "_id": "f3fa712d-2124-433a-b405-c02757fa1503",
+    "name": "Angelica Stein",
+    "alias": "reedjason",
+    "powers": ["Invisibility"],
+    "first_appearance": "1981-01-02T00:00:00",
+    "affiliations": ["Avengers"],
+    "origin": "Earth",
+    "is_villain": true
+  }
+]
diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json b/tests/accuracy/test-data-dumps/mflix.movies.json
new file mode 100644
index 00000000..3c492185
--- /dev/null
+++ b/tests/accuracy/test-data-dumps/mflix.movies.json
@@ -0,0 +1,496 @@
+[
+  {
+    "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4",
+    "title": "Human sell",
+    "release_year": 1993,
+    "genres": ["Sci-Fi"],
+    "director": "Christina Collins",
+    "cast": ["Jeremy Marks", "Matthew Moore", "Erica Miller", "Beth Morales"],
+    "runtime": 139,
+    "rating": 9.3
+  },
+  {
+    "_id": "ab338dcb-c541-4d39-ba3d-58e4ebcac16c",
+    "title": "Trial we much",
+    "release_year": 2020,
+    "genres": ["Horror", "Comedy"],
+    "director": "Steven Miles",
+    "cast": [
+      "Patrick Huynh",
+      "Darrell Thompson",
+      "Lindsay Thompson",
+      "Brandi Cooper"
+    ],
+    "runtime": 149,
+    "rating": 5.0
+  },
+  {
+    "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db",
+    "title": "Someone",
+    "release_year": 1996,
+    "genres": ["Action", "Horror"],
+    "director": "Steven Miles",
+    "cast": [
+      "Carrie Cummings",
+      "Patricia Rice",
+      "Suzanne Collins",
+      "April Murray",
+      "Kimberly Shaw"
+    ],
+    "runtime": 153,
+    "rating": 2.6
+  },
+  {
+    "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42",
+    "title": "Without our",
+    "release_year": 2012,
+    "genres": ["Comedy"],
+    "director": "Christina Collins",
+    "cast": [
+      "Rodney Gray",
+      "Mr. Joseph Allen",
+      "Heather Robles",
+      "Eric Edwards",
+      "James Wilson"
+    ],
+    "runtime": 143,
+    "rating": 9.1
+  },
+  {
+    "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7",
+    "title": "Cost anything",
+    "release_year": 2002,
+    "genres": ["Romance", "Action"],
+    "director": "Bryan Andrews",
+    "cast": ["Gregory Mullins", "Jillian Arroyo", "Angela Reed"],
+    "runtime": 112,
+    "rating": 3.8
+  },
+  {
+    "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138",
+    "title": "Hold green energy their",
+    "release_year": 1989,
+    "genres": ["Horror"],
+    "director": "Christina Collins",
+    "cast": [
+      "Eduardo Carey",
+      "Jodi Miller",
+      "Ronald Johnson",
+      "Lindsay Hernandez"
+    ],
+    "runtime": 126,
+    "rating": 7.4
+  },
+  {
+    "_id": "1b81c45b-1d09-47dc-871f-ace109107446",
+    "title": "Choose ability start",
+    "release_year": 1990,
+    "genres": ["Drama", "Comedy"],
+    "director": "Bryan Andrews",
+    "cast": [
+      "Tyler Daniels",
+      "Gregory Harris",
+      "Whitney Swanson",
+      "Pamela Ramirez"
+    ],
+    "runtime": 141,
+    "rating": 5.6
+  },
+  {
+    "_id": "400a08be-f07b-416a-8cdc-46c9886b812b",
+    "title": "Cover perhaps",
+    "release_year": 2022,
+    "genres": ["Drama"],
+    "director": "Daniel Wallace",
+    "cast": ["Victoria Price", "Holly Ross", "Michele Jones"],
+    "runtime": 173,
+    "rating": 4.3
+  },
+  {
+    "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f",
+    "title": "Policy particularly",
+    "release_year": 2003,
+    "genres": ["Comedy"],
+    "director": "Brittany Parker",
+    "cast": ["Emily Haynes", "Crystal Johnson", "Ernest Jones"],
+    "runtime": 154,
+    "rating": 6.6
+  },
+  {
+    "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704",
+    "title": "Store care",
+    "release_year": 2017,
+    "genres": ["Romance", "Sci-Fi"],
+    "director": "Sara Stewart",
+    "cast": [
+      "Katherine Matthews",
+      "Stacey Wolf",
+      "Laurie Blackwell",
+      "Luis Ortiz",
+      "Christopher Vasquez"
+    ],
+    "runtime": 168,
+    "rating": 7.7
+  },
+  {
+    "_id": "99e75e60-6466-4314-92c3-00c433a06600",
+    "title": "Section close bad",
+    "release_year": 2024,
+    "genres": ["Drama", "Comedy"],
+    "director": "Bryan Andrews",
+    "cast": [
+      "Heather Marshall",
+      "Alexander Austin",
+      "Stephanie Villarreal MD",
+      "Ryan Marquez"
+    ],
+    "runtime": 180,
+    "rating": 7.7
+  },
+  {
+    "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273",
+    "title": "Become stand",
+    "release_year": 2001,
+    "genres": ["Sci-Fi", "Thriller"],
+    "director": "Brian Martinez",
+    "cast": ["Robert Ross", "Kimberly Williamson", "Pam Wyatt"],
+    "runtime": 162,
+    "rating": 1.5
+  },
+  {
+    "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0",
+    "title": "I case",
+    "release_year": 2012,
+    "genres": ["Drama", "Comedy"],
+    "director": "Brittany Parker",
+    "cast": [
+      "Justin Davis",
+      "Karen Doyle",
+      "Daniel Jackson",
+      "Courtney Mcdonald"
+    ],
+    "runtime": 122,
+    "rating": 3.1
+  },
+  {
+    "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760",
+    "title": "No organization style",
+    "release_year": 2013,
+    "genres": ["Comedy"],
+    "director": "Christina Collins",
+    "cast": ["Benjamin Whitney", "Joseph Bush", "Barbara Griffin"],
+    "runtime": 167,
+    "rating": 9.6
+  },
+  {
+    "_id": "15855c7b-ece2-4238-b995-57f6207509ea",
+    "title": "Computer garden",
+    "release_year": 2012,
+    "genres": ["Horror"],
+    "director": "Steven Miles",
+    "cast": ["Darlene Lee", "Tina Wang", "Nathan Mayo"],
+    "runtime": 146,
+    "rating": 6.5
+  },
+  {
+    "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67",
+    "title": "Trip information feel",
+    "release_year": 2008,
+    "genres": ["Action", "Thriller"],
+    "director": "Brittany Parker",
+    "cast": ["Kelly Walsh", "Michael Rocha"],
+    "runtime": 148,
+    "rating": 9.8
+  },
+  {
+    "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b",
+    "title": "It project low part",
+    "release_year": 1992,
+    "genres": ["Horror"],
+    "director": "Christina Collins",
+    "cast": [
+      "Sheena Murphy",
+      "Amanda Miller",
+      "Erica Curtis",
+      "Roger Jones",
+      "Andrew Simpson"
+    ],
+    "runtime": 161,
+    "rating": 2.4
+  },
+  {
+    "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a",
+    "title": "Near attorney discuss",
+    "release_year": 1983,
+    "genres": ["Comedy"],
+    "director": "Christina Collins",
+    "cast": [
+      "Chase Myers",
+      "Benjamin Kelly",
+      "Thomas Summers MD",
+      "Jessica Woods"
+    ],
+    "runtime": 174,
+    "rating": 9.5
+  },
+  {
+    "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549",
+    "title": "Whether know",
+    "release_year": 2009,
+    "genres": ["Comedy", "Thriller"],
+    "director": "Bryan Andrews",
+    "cast": ["Amy Reed", "William Williams", "Steven Lawrence"],
+    "runtime": 134,
+    "rating": 9.6
+  },
+  {
+    "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19",
+    "title": "Against place",
+    "release_year": 2017,
+    "genres": ["Drama", "Romance"],
+    "director": "Daniel Wallace",
+    "cast": [
+      "Brittany Thompson",
+      "Clinton Bishop",
+      "Terri Meyer",
+      "Stacey Phillips",
+      "Alexander Hunt"
+    ],
+    "runtime": 152,
+    "rating": 5.0
+  },
+  {
+    "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5",
+    "title": "Return yard",
+    "release_year": 1994,
+    "genres": ["Horror"],
+    "director": "Christina Collins",
+    "cast": ["Mason Lara", "Taylor Salinas", "Tim Foster", "Erin Sharp"],
+    "runtime": 99,
+    "rating": 8.8
+  },
+  {
+    "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992",
+    "title": "Certain fish",
+    "release_year": 2009,
+    "genres": ["Romance"],
+    "director": "Steven Miles",
+    "cast": [
+      "Jonathan King",
+      "Caitlyn Costa DDS",
+      "Steve Davis",
+      "Perry Anderson"
+    ],
+    "runtime": 130,
+    "rating": 8.6
+  },
+  {
+    "_id": "c95e74b0-e47e-4d10-b847-8caa20b94b32",
+    "title": "Agreement like program",
+    "release_year": 2004,
+    "genres": ["Sci-Fi"],
+    "director": "Daniel Jackson",
+    "cast": [
+      "Ashley Green",
+      "Rebecca Osborne",
+      "Robert Williams",
+      "Breanna Dunn",
+      "Philip Vargas"
+    ],
+    "runtime": 110,
+    "rating": 8.1
+  },
+  {
+    "_id": "791688be-4358-45ab-956e-71fe3fd35d19",
+    "title": "Floor seven then",
+    "release_year": 2009,
+    "genres": ["Horror"],
+    "director": "Daniel Wallace",
+    "cast": ["Dustin Wright", "Crystal Young"],
+    "runtime": 143,
+    "rating": 4.8
+  },
+  {
+    "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474",
+    "title": "Like rather paper",
+    "release_year": 2006,
+    "genres": ["Drama"],
+    "director": "Spencer Gillespie",
+    "cast": ["Sean Moyer", "James Edwards", "Tara Lee", "Robert Scott"],
+    "runtime": 175,
+    "rating": 9.1
+  },
+  {
+    "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca",
+    "title": "Argue hospital",
+    "release_year": 1994,
+    "genres": ["Romance", "Sci-Fi"],
+    "director": "Amanda Young",
+    "cast": [
+      "Carolyn Williams",
+      "Jasmin Sampson",
+      "Phillip Levy",
+      "Brenda Clark",
+      "Lauren Perry"
+    ],
+    "runtime": 149,
+    "rating": 9.5
+  },
+  {
+    "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601",
+    "title": "Become after card",
+    "release_year": 1986,
+    "genres": ["Sci-Fi", "Horror"],
+    "director": "Brian Martinez",
+    "cast": ["Rhonda Ochoa", "Charlene Castillo"],
+    "runtime": 100,
+    "rating": 8.5
+  },
+  {
+    "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06",
+    "title": "Born authority attention",
+    "release_year": 1994,
+    "genres": ["Romance"],
+    "director": "Brian Martinez",
+    "cast": ["Matthew Thomas", "Carly Perkins"],
+    "runtime": 131,
+    "rating": 4.9
+  },
+  {
+    "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311",
+    "title": "Local seven media",
+    "release_year": 1998,
+    "genres": ["Sci-Fi", "Drama"],
+    "director": "Amanda Young",
+    "cast": ["Jessica Perez", "Larry Atkinson"],
+    "runtime": 95,
+    "rating": 2.0
+  },
+  {
+    "_id": "498597d2-3254-46ef-a800-f322a86fbd55",
+    "title": "Keep employee",
+    "release_year": 1981,
+    "genres": ["Horror"],
+    "director": "Christina Collins",
+    "cast": ["Alexis Carlson", "Andrew Stewart"],
+    "runtime": 161,
+    "rating": 6.0
+  },
+  {
+    "_id": "788d9343-6908-4762-88ee-b04aba1e58b5",
+    "title": "American question generation",
+    "release_year": 1986,
+    "genres": ["Romance"],
+    "director": "Daniel Jackson",
+    "cast": ["Troy Carter", "Peter Hernandez", "Christine Brown"],
+    "runtime": 176,
+    "rating": 8.0
+  },
+  {
+    "_id": "74bcf255-df91-40c0-85c0-d7b85ff84f9a",
+    "title": "Maintain out",
+    "release_year": 2000,
+    "genres": ["Sci-Fi", "Action"],
+    "director": "Brian Martinez",
+    "cast": ["Nancy Evans", "Michael Gill", "Justin Carroll"],
+    "runtime": 179,
+    "rating": 10.0
+  },
+  {
+    "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f",
+    "title": "Ten box study",
+    "release_year": 2011,
+    "genres": ["Horror", "Romance"],
+    "director": "Steven Miles",
+    "cast": [
+      "Mark Hicks",
+      "Michelle Dean",
+      "John Buchanan",
+      "Veronica Johnson"
+    ],
+    "runtime": 147,
+    "rating": 2.5
+  },
+  {
+    "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4",
+    "title": "Production operation",
+    "release_year": 2014,
+    "genres": ["Horror", "Romance"],
+    "director": "Sara Stewart",
+    "cast": ["Ashley Mata", "Mark Kelly", "John West", "Harold Day"],
+    "runtime": 125,
+    "rating": 4.1
+  },
+  {
+    "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92",
+    "title": "What language",
+    "release_year": 2004,
+    "genres": ["Sci-Fi"],
+    "director": "Sara Stewart",
+    "cast": [
+      "Scott Mckenzie",
+      "Jason Lee",
+      "Nathan Gardner",
+      "Jamie Greene",
+      "Angela Garner"
+    ],
+    "runtime": 177,
+    "rating": 3.7
+  },
+  {
+    "_id": "b32dd176-938b-4ded-823a-311423fdc2ea",
+    "title": "Up usually central",
+    "release_year": 2011,
+    "genres": ["Sci-Fi", "Comedy"],
+    "director": "Daniel Jackson",
+    "cast": ["Jennifer Carlson", "Jonathan Stewart DDS", "Amy Lester"],
+    "runtime": 159,
+    "rating": 5.6
+  },
+  {
+    "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f",
+    "title": "For boy only",
+    "release_year": 1987,
+    "genres": ["Thriller", "Action"],
+    "director": "Sara Stewart",
+    "cast": ["Gene Smith", "Robert Osborne Jr.", "Laura Fox", "Alexis Lowe"],
+    "runtime": 95,
+    "rating": 3.6
+  },
+  {
+    "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c",
+    "title": "Site win including your",
+    "release_year": 2008,
+    "genres": ["Sci-Fi"],
+    "director": "Spencer Gillespie",
+    "cast": [
+      "John Williams",
+      "Jason Huang",
+      "Karen Klein",
+      "Gary Tran",
+      "Jessica Murphy"
+    ],
+    "runtime": 178,
+    "rating": 6.2
+  },
+  {
+    "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972",
+    "title": "Sell huge hair",
+    "release_year": 1997,
+    "genres": ["Thriller", "Action"],
+    "director": "Bryan Andrews",
+    "cast": ["Thomas Johnson", "Ryan Morrow"],
+    "runtime": 157,
+    "rating": 4.4
+  },
+  {
+    "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982",
+    "title": "Guy rest",
+    "release_year": 1997,
+    "genres": ["Sci-Fi", "Horror"],
+    "director": "Steven Miles",
+    "cast": ["Michael Fox", "Tyler Acosta", "Tracy Adams"],
+    "runtime": 122,
+    "rating": 7.8
+  }
+]
diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json
new file mode 100644
index 00000000..2edc7fa7
--- /dev/null
+++ b/tests/accuracy/test-data-dumps/mflix.shows.json
@@ -0,0 +1,572 @@
+[
+  {
+    "_id": "b586e37c-6b32-417d-a53c-2a4c1121b11b",
+    "title": "Object-based analyzing architecture",
+    "seasons": 8,
+    "episodes": 62,
+    "platform": "Amazon Prime",
+    "genres": ["Comedy"],
+    "cast": [
+      "Roger Gomez",
+      "Sandra Williams",
+      "Matthew Rodriguez",
+      "Scott Brown",
+      "Kristie Horn",
+      "Nicole Avila"
+    ],
+    "start_year": 2014,
+    "end_year": null
+  },
+  {
+    "_id": "c28471ea-336f-4060-9b18-0bbff3de6622",
+    "title": "Customer-focused encompassing architecture",
+    "seasons": 4,
+    "episodes": 108,
+    "platform": "Hulu",
+    "genres": ["Thriller"],
+    "cast": ["Joseph Holmes", "Patrick Smith", "Charles Delacruz"],
+    "start_year": 2001,
+    "end_year": null
+  },
+  {
+    "_id": "93f0969b-2377-4531-9c4e-45d2593015cd",
+    "title": "User-centric background approach",
+    "seasons": 6,
+    "episodes": 49,
+    "platform": "HBO",
+    "genres": ["Comedy", "Documentary"],
+    "cast": [
+      "Jason Castillo",
+      "Jessica Burke",
+      "Philip Lewis",
+      "Philip Goodman",
+      "Corey Lee"
+    ],
+    "start_year": 2016,
+    "end_year": 2018
+  },
+  {
+    "_id": "a0b76db0-99a1-49fe-a5ea-fe802a66bde9",
+    "title": "Networked directional budgetary management",
+    "seasons": 5,
+    "episodes": 23,
+    "platform": "Amazon Prime",
+    "genres": ["Comedy", "Thriller"],
+    "cast": ["Mark Allen", "Anthony Snyder", "Kimberly Jones"],
+    "start_year": 2002,
+    "end_year": null
+  },
+  {
+    "_id": "fbdef9b9-1ad4-4a6b-a39a-2e0b90423cb5",
+    "title": "Enterprise-wide dynamic intranet",
+    "seasons": 1,
+    "episodes": 12,
+    "platform": "Amazon Prime",
+    "genres": ["Crime", "Documentary"],
+    "cast": ["Matthew Green", "Kelly Wright", "Tonya Sullivan", "Daniel Brown"],
+    "start_year": 2009,
+    "end_year": 2020
+  },
+  {
+    "_id": "db54ab5c-bf6b-48ea-8272-1b1a4a76b848",
+    "title": "Exclusive real-time access",
+    "seasons": 10,
+    "episodes": 76,
+    "platform": "Amazon Prime",
+    "genres": ["Drama"],
+    "cast": ["Stacey Shaw", "Zachary Steele", "Laurie Martinez"],
+    "start_year": 2011,
+    "end_year": 2020
+  },
+  {
+    "_id": "53869b62-c8c7-48b3-86c9-17c935b43ff6",
+    "title": "Persevering leadingedge application",
+    "seasons": 5,
+    "episodes": 73,
+    "platform": "HBO",
+    "genres": ["Thriller"],
+    "cast": ["Diane Boyd", "Anna Rubio", "Cheryl Fisher", "Tyler Villa"],
+    "start_year": 2008,
+    "end_year": 2020
+  },
+  {
+    "_id": "3be07c4d-5275-4181-b2f6-5b1a1e46aa7b",
+    "title": "Multi-lateral analyzing model",
+    "seasons": 2,
+    "episodes": 114,
+    "platform": "Amazon Prime",
+    "genres": ["Fantasy"],
+    "cast": [
+      "Kathleen Marshall",
+      "Kimberly Quinn",
+      "Steven Parker",
+      "Adrienne Green",
+      "Justin Hughes",
+      "Jean Smith"
+    ],
+    "start_year": 2017,
+    "end_year": 2023
+  },
+  {
+    "_id": "50cb455b-5ec0-4e68-8601-43e58defb762",
+    "title": "User-centric tangible monitoring",
+    "seasons": 3,
+    "episodes": 55,
+    "platform": "Disney+",
+    "genres": ["Drama"],
+    "cast": [
+      "Barbara Clark",
+      "Carolyn Scott",
+      "Timothy Reed",
+      "Cory Burton",
+      "Jacob Hill"
+    ],
+    "start_year": 2006,
+    "end_year": 2012
+  },
+  {
+    "_id": "bab2dba4-88bd-4b24-afce-8781eb280d53",
+    "title": "Persevering background monitoring",
+    "seasons": 4,
+    "episodes": 61,
+    "platform": "Amazon Prime",
+    "genres": ["Comedy", "Fantasy"],
+    "cast": ["Adam Lin", "Evan Smith", "Christine Howard", "Ruben Hopkins"],
+    "start_year": 2006,
+    "end_year": 2023
+  },
+  {
+    "_id": "518f2ad9-bb65-4228-8d4c-7a62b9f88599",
+    "title": "Cross-group intangible architecture",
+    "seasons": 1,
+    "episodes": 90,
+    "platform": "HBO",
+    "genres": ["Comedy"],
+    "cast": [
+      "Eric Ryan",
+      "Ashley Ball",
+      "Douglas Barton",
+      "Brian Whitehead",
+      "Michael Greer"
+    ],
+    "start_year": 2018,
+    "end_year": null
+  },
+  {
+    "_id": "d5f9304d-567d-4335-b43c-ec4034d7009f",
+    "title": "Programmable bottom-line monitoring",
+    "seasons": 10,
+    "episodes": 69,
+    "platform": "Hulu",
+    "genres": ["Documentary", "Fantasy"],
+    "cast": [
+      "Mrs. Olivia Booth",
+      "William Murphy",
+      "Patricia Payne",
+      "Lisa Estes",
+      "Jason Martin",
+      "Jeff Greene"
+    ],
+    "start_year": 2011,
+    "end_year": 2024
+  },
+  {
+    "_id": "27718a30-6e42-47ad-8adf-1533b9b8a419",
+    "title": "Multi-lateral multi-tasking contingency",
+    "seasons": 3,
+    "episodes": 89,
+    "platform": "Disney+",
+    "genres": ["Crime"],
+    "cast": ["Elizabeth Lambert", "Corey Hughes", "Melissa Stephens"],
+    "start_year": 2006,
+    "end_year": null
+  },
+  {
+    "_id": "defc7620-3b4e-46ff-a949-bec1af753812",
+    "title": "Focused zero administration migration",
+    "seasons": 9,
+    "episodes": 73,
+    "platform": "Disney+",
+    "genres": ["Documentary", "Drama"],
+    "cast": ["Shane Richardson", "Lisa Cooper", "Samantha Perkins"],
+    "start_year": 2008,
+    "end_year": null
+  },
+  {
+    "_id": "9d6781fb-d095-4a00-932d-3f1fac1b0049",
+    "title": "Horizontal methodical encoding",
+    "seasons": 8,
+    "episodes": 40,
+    "platform": "Netflix",
+    "genres": ["Crime"],
+    "cast": ["Patricia Barrett", "Scott Gonzalez", "Michaela Johnson"],
+    "start_year": 2006,
+    "end_year": null
+  },
+  {
+    "_id": "ac19b1b1-2bf9-4093-83fa-60411aa3f80f",
+    "title": "Enterprise-wide analyzing product",
+    "seasons": 8,
+    "episodes": 61,
+    "platform": "Hulu",
+    "genres": ["Drama"],
+    "cast": ["Christie Waters", "Casey Allen", "Nicole Frank"],
+    "start_year": 2001,
+    "end_year": 2005
+  },
+  {
+    "_id": "2dfd2240-dc9f-439f-9e06-b1ec8de397bf",
+    "title": "Compatible well-modulated extranet",
+    "seasons": 10,
+    "episodes": 89,
+    "platform": "Hulu",
+    "genres": ["Drama"],
+    "cast": [
+      "Pedro Butler",
+      "Christian Hall",
+      "Dawn Gregory",
+      "Shannon Russell",
+      "Omar Mullins",
+      "Ian Ramos"
+    ],
+    "start_year": 2012,
+    "end_year": 2013
+  },
+  {
+    "_id": "94db1534-7163-430e-83e3-6a75bc6aec0f",
+    "title": "User-centric tangible infrastructure",
+    "seasons": 5,
+    "episodes": 11,
+    "platform": "Hulu",
+    "genres": ["Drama"],
+    "cast": [
+      "Deborah Garcia",
+      "Michelle Barajas",
+      "Melissa Reynolds",
+      "Douglas Wilson"
+    ],
+    "start_year": 2001,
+    "end_year": null
+  },
+  {
+    "_id": "65b2213f-a606-42d8-b845-0199ba2e9b82",
+    "title": "Inverse optimal circuit",
+    "seasons": 1,
+    "episodes": 29,
+    "platform": "Amazon Prime",
+    "genres": ["Fantasy", "Documentary"],
+    "cast": [
+      "Grace Rodriguez",
+      "Alison Greene",
+      "Michael Allen",
+      "Steven Hayden"
+    ],
+    "start_year": 2013,
+    "end_year": null
+  },
+  {
+    "_id": "5a8a2745-e57c-4086-aa09-84131f40149f",
+    "title": "Public-key discrete alliance",
+    "seasons": 9,
+    "episodes": 111,
+    "platform": "Disney+",
+    "genres": ["Documentary"],
+    "cast": [
+      "Emily Irwin",
+      "Olivia Gibson",
+      "Jean Hernandez",
+      "Michael Cummings"
+    ],
+    "start_year": 2013,
+    "end_year": 2022
+  },
+  {
+    "_id": "51326558-2080-4615-a583-b4f2fbd15600",
+    "title": "Managed zero administration groupware",
+    "seasons": 8,
+    "episodes": 108,
+    "platform": "Hulu",
+    "genres": ["Drama", "Crime"],
+    "cast": [
+      "Karen Phillips",
+      "Kelly Marsh",
+      "Daniel Hamilton",
+      "Abigail Smith"
+    ],
+    "start_year": 2018,
+    "end_year": 2019
+  },
+  {
+    "_id": "87a2cd5f-75ee-4650-b2a4-a56384c97137",
+    "title": "Reverse-engineered static initiative",
+    "seasons": 6,
+    "episodes": 66,
+    "platform": "Amazon Prime",
+    "genres": ["Crime", "Documentary"],
+    "cast": [
+      "Bradley Chavez",
+      "Catherine Horn",
+      "Joseph Bryant",
+      "Tara Rodriguez"
+    ],
+    "start_year": 2003,
+    "end_year": 2006
+  },
+  {
+    "_id": "0f647458-d09f-4be8-b1dc-49be1ba1e104",
+    "title": "Fundamental tangible matrices",
+    "seasons": 9,
+    "episodes": 22,
+    "platform": "Hulu",
+    "genres": ["Drama"],
+    "cast": ["Eric Lee", "Patrick Estrada", "Kelsey Brown", "Jeffrey Lewis"],
+    "start_year": 2001,
+    "end_year": null
+  },
+  {
+    "_id": "53d34237-0e86-4a5e-922b-0589c2e65458",
+    "title": "Self-enabling homogeneous infrastructure",
+    "seasons": 5,
+    "episodes": 35,
+    "platform": "Hulu",
+    "genres": ["Crime"],
+    "cast": [
+      "Chad Torres",
+      "Mark Williams",
+      "Terry Mcguire",
+      "Kathleen Cantu",
+      "Harold Knapp"
+    ],
+    "start_year": 2006,
+    "end_year": null
+  },
+  {
+    "_id": "71cc1515-ba84-4df6-92db-55af3cfa91f0",
+    "title": "Horizontal web-enabled application",
+    "seasons": 2,
+    "episodes": 94,
+    "platform": "Netflix",
+    "genres": ["Thriller", "Fantasy"],
+    "cast": [
+      "Catherine Davila",
+      "Jessica James",
+      "Cory Miller",
+      "Alexis Sanchez",
+      "Andrew Miller"
+    ],
+    "start_year": 2002,
+    "end_year": 2017
+  },
+  {
+    "_id": "200556f7-10c6-4414-83f7-24ef74bff12a",
+    "title": "User-friendly bi-directional data-warehouse",
+    "seasons": 2,
+    "episodes": 87,
+    "platform": "Hulu",
+    "genres": ["Drama", "Fantasy"],
+    "cast": [
+      "Tiffany Brown",
+      "Christina Morales",
+      "Samuel Blake",
+      "Stephanie Johnson",
+      "Wesley Deleon"
+    ],
+    "start_year": 2020,
+    "end_year": null
+  },
+  {
+    "_id": "613832c9-5307-4c80-9dde-3eab4e5aa770",
+    "title": "Pre-emptive leadingedge capacity",
+    "seasons": 5,
+    "episodes": 56,
+    "platform": "Netflix",
+    "genres": ["Comedy"],
+    "cast": ["James Durham", "Jessica Myers", "Rachel King"],
+    "start_year": 2005,
+    "end_year": null
+  },
+  {
+    "_id": "f9cb1076-3eaf-41d2-84df-057d27c1a544",
+    "title": "Fundamental intangible contingency",
+    "seasons": 4,
+    "episodes": 99,
+    "platform": "Disney+",
+    "genres": ["Crime", "Fantasy"],
+    "cast": [
+      "Robert Foster",
+      "Jill Barton",
+      "Kimberly Simmons",
+      "Tracey Gomez"
+    ],
+    "start_year": 2017,
+    "end_year": 2020
+  },
+  {
+    "_id": "f96b112f-943e-43cd-90f0-56725cfa7e59",
+    "title": "Diverse asymmetric forecast",
+    "seasons": 9,
+    "episodes": 24,
+    "platform": "Amazon Prime",
+    "genres": ["Drama", "Crime"],
+    "cast": [
+      "Carl Johnson",
+      "Douglas Beck",
+      "Kevin Guerra",
+      "Taylor Wilson",
+      "Eric Jarvis",
+      "Sarah Charles MD"
+    ],
+    "start_year": 2007,
+    "end_year": null
+  },
+  {
+    "_id": "78eb682f-a03d-4cbf-bbfc-0e899e5f50d0",
+    "title": "Profit-focused solution-oriented Graphical User Interface",
+    "seasons": 10,
+    "episodes": 117,
+    "platform": "HBO",
+    "genres": ["Crime", "Fantasy"],
+    "cast": ["Carol Miller", "Jennifer Bass", "Melanie Leblanc"],
+    "start_year": 2002,
+    "end_year": null
+  },
+  {
+    "_id": "ebb6d3c9-3c98-4799-94bc-aadd0bf2974c",
+    "title": "Reduced leadingedge system engine",
+    "seasons": 1,
+    "episodes": 58,
+    "platform": "Hulu",
+    "genres": ["Crime", "Drama"],
+    "cast": [
+      "James Warren",
+      "Kelly Carter",
+      "Sarah Jones",
+      "Aaron Castaneda",
+      "Katherine Manning"
+    ],
+    "start_year": 2011,
+    "end_year": null
+  },
+  {
+    "_id": "4ffd32a7-0bf4-4c95-a7c8-19002c2eb83c",
+    "title": "Switchable 24/7 website",
+    "seasons": 6,
+    "episodes": 71,
+    "platform": "Netflix",
+    "genres": ["Documentary"],
+    "cast": [
+      "Sarah Brown",
+      "Patrick Beck",
+      "Angela Herrera MD",
+      "Steven Mcconnell"
+    ],
+    "start_year": 2018,
+    "end_year": null
+  },
+  {
+    "_id": "37267325-4337-4912-992f-a162f9014569",
+    "title": "Synergized asymmetric adapter",
+    "seasons": 4,
+    "episodes": 16,
+    "platform": "Hulu",
+    "genres": ["Fantasy"],
+    "cast": ["Gabrielle Meyer", "Madison Matthews", "Taylor Martinez"],
+    "start_year": 2010,
+    "end_year": null
+  },
+  {
+    "_id": "ea2abd77-c7da-443e-89fd-6f410f5d697e",
+    "title": "Extended contextually-based customer loyalty",
+    "seasons": 1,
+    "episodes": 79,
+    "platform": "Hulu",
+    "genres": ["Fantasy"],
+    "cast": ["Michael Lewis", "Cassandra Hicks", "Sydney Garcia"],
+    "start_year": 2015,
+    "end_year": 2023
+  },
+  {
+    "_id": "b568dd56-c083-4431-a740-4f4b5f4e1b21",
+    "title": "Versatile grid-enabled application",
+    "seasons": 7,
+    "episodes": 82,
+    "platform": "Hulu",
+    "genres": ["Crime", "Fantasy"],
+    "cast": ["Keith Brown", "Annette Johnson", "Joseph Carroll", "Derek Lewis"],
+    "start_year": 2006,
+    "end_year": 2008
+  },
+  {
+    "_id": "b6f2e1c3-6915-4e02-b1c2-44b5bec8fd68",
+    "title": "Operative optimizing encryption",
+    "seasons": 2,
+    "episodes": 52,
+    "platform": "Amazon Prime",
+    "genres": ["Fantasy", "Drama"],
+    "cast": [
+      "Garrett Mcgrath",
+      "Craig Jackson",
+      "Michael Sullivan",
+      "Andrew Boyer"
+    ],
+    "start_year": 2011,
+    "end_year": null
+  },
+  {
+    "_id": "51c225d5-aa67-4b14-aca5-33757cef6bf4",
+    "title": "Business-focused 24/7 collaboration",
+    "seasons": 1,
+    "episodes": 113,
+    "platform": "Netflix",
+    "genres": ["Thriller", "Comedy"],
+    "cast": ["Matthew Hill", "Andrew White", "Grant Young", "John Mathews"],
+    "start_year": 2015,
+    "end_year": 2020
+  },
+  {
+    "_id": "7465e69f-341e-4234-8ffb-400622442a40",
+    "title": "Organized bi-directional application",
+    "seasons": 3,
+    "episodes": 40,
+    "platform": "Netflix",
+    "genres": ["Comedy"],
+    "cast": [
+      "Matthew Gordon",
+      "Mark Allen",
+      "Amanda Webb",
+      "Jeffrey Horton",
+      "Sheila Lewis",
+      "Marcus Gilbert"
+    ],
+    "start_year": 2011,
+    "end_year": null
+  },
+  {
+    "_id": "90570eac-f923-4c30-a5b0-661b28a8e4a5",
+    "title": "Configurable bottom-line success",
+    "seasons": 10,
+    "episodes": 106,
+    "platform": "HBO",
+    "genres": ["Fantasy", "Drama"],
+    "cast": [
+      "Elizabeth Taylor",
+      "Melissa Mullins",
+      "Alan Nguyen",
+      "Carolyn Kidd",
+      "Michael Pope"
+    ],
+    "start_year": 2015,
+    "end_year": null
+  },
+  {
+    "_id": "06d70791-5487-4dab-8b84-a91b3376e396",
+    "title": "Organic dedicated analyzer",
+    "seasons": 3,
+    "episodes": 88,
+    "platform": "HBO",
+    "genres": ["Thriller", "Drama"],
+    "cast": ["Amy Aguilar", "James Williams", "Kevin Kirby"],
+    "start_year": 2010,
+    "end_year": 2025
+  }
+]
diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts
new file mode 100644
index 00000000..86f96705
--- /dev/null
+++ b/tests/accuracy/update-many.test.ts
@@ -0,0 +1,54 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/**
+ * Builds the accuracy test case for an unfiltered bulk update: one expected
+ * "update-many" call on mflix.movies that sets `new_field` to 1, no `filter`.
+ *
+ * @param prompt - Prompt text stored verbatim in the returned config.
+ * @returns Config pairing the prompt with the single expected tool call.
+ */
+function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig {
+    const testCase: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [
+            {
+                toolName: "update-many",
+                parameters: { database: "mflix", collection: "movies", update: { $set: { new_field: 1 } } },
+            },
+        ],
+    };
+    return testCase;
+}
+
+/**
+ * Builds the accuracy test case for a filtered bulk update: one expected
+ * "update-many" call on mflix.movies that sets `new_field` to 1 for `filter`.
+ *
+ * @param prompt - Prompt text stored verbatim in the returned config.
+ * @param filter - Filter document placed as-is in the expected parameters.
+ * @returns Config pairing the prompt with the single expected tool call.
+ */
+function callsUpdateManyWithFilters(prompt: string, filter: Record<string, unknown>): AccuracyTestConfig {
+    const testCase: AccuracyTestConfig = {
+        prompt,
+        expectedToolCalls: [
+            {
+                toolName: "update-many",
+                parameters: { database: "mflix", collection: "movies", filter, update: { $set: { new_field: 1 } } },
+            },
+        ],
+    };
+    return testCase;
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsUpdateManyWithEmptyFilters(
+        "Update all the documents in 'mflix.movies' namespace with a new field 'new_field' set to 1"
+    ), // unfiltered scenario: the expected tool call carries no `filter` parameter
+    callsUpdateManyWithFilters(
+        "Update all the documents in 'mflix.movies' namespace, where runtime is less than 100, with a new field 'new_field' set to 1",
+        { runtime: { $lt: 100 } } // expected filter, mirroring "runtime is less than 100" in the prompt
+    ),
+]); // presumably each case is run against every model from getAvailableModels() - confirm in describe-accuracy-tests
diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts
index 935b27db..8df9b059 100644
--- a/tests/integration/tools/mongodb/mongodbHelpers.ts
+++ b/tests/integration/tools/mongodb/mongodbHelpers.ts
@@ -2,12 +2,37 @@ import { MongoCluster } from "mongodb-runner";
 import path from "path";
 import { fileURLToPath } from "url";
 import fs from "fs/promises";
-import { MongoClient, ObjectId } from "mongodb";
+import { Document, MongoClient, ObjectId } from "mongodb";
 import { getResponseContent, IntegrationTest, setupIntegrationTest, defaultTestConfig } from "../../helpers.js";
 import { UserConfig } from "../../../../src/config.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
+/**
+ * Directory containing the JSON fixture dumps (tests/accuracy/test-data-dumps).
+ * It is resolved relative to this file, three levels up from
+ * tests/integration/tools/mongodb, not from the process working directory.
+ */
+const testDataDumpPath = path.join(__dirname, "..", "..", "..", "accuracy", "test-data-dumps");
+
+/** Namespaces that ship a "<db>.<collection>.json" dump inside testDataDumpPath. */
+const testDataNamespaces = [
+    { db: "comics", collection: "books" },
+    { db: "comics", collection: "characters" },
+    { db: "mflix", collection: "movies" },
+    { db: "mflix", collection: "shows" },
+];
+
+/**
+ * Fixture manifest read by prepareTestData: each entry names the target
+ * database and collection together with the path of the dump file to load.
+ */
+const testDataPaths = testDataNamespaces.map(({ db, collection }) => ({
+    db,
+    collection,
+    path: path.join(testDataDumpPath, `${db}.${collection}.json`),
+}));
+
 interface MongoDBIntegrationTest {
     mongoClient: () => MongoClient;
     connectionString: () => string;
@@ -169,3 +194,41 @@ export function validateAutoConnectBehavior(
         });
     });
 }
+
+/**
+ * Registers a `beforeAll` hook that parses every dump listed in `testDataPaths`
+ * and returns helpers for seeding those fixtures and dropping test databases.
+ *
+ * @param integration - Supplies the MongoClient used by `populateTestData`.
+ * @returns `populateTestData`, which inserts each fixture into its collection, and
+ *          `cleanupTestDatabases`, which drops every database except admin/config/local.
+ */
+export function prepareTestData(integration: MongoDBIntegrationTest) {
+    const systemDbNames = ["admin", "config", "local"];
+    const fixtures: { db: string; collection: string; data: Document[] }[] = [];
+
+    beforeAll(async () => {
+        // Read the dumps one at a time, in manifest order.
+        for (const entry of testDataPaths) {
+            const raw = await fs.readFile(entry.path, "utf8");
+            fixtures.push({ db: entry.db, collection: entry.collection, data: JSON.parse(raw) as Document[] });
+        }
+    });
+
+    async function populateTestData(this: void) {
+        const mongo = integration.mongoClient();
+        for (const fixture of fixtures) {
+            await mongo.db(fixture.db).collection(fixture.collection).insertMany(fixture.data);
+        }
+    }
+
+    // NOTE(review): the parameter shadows the `integration` captured above; the argument is the one used.
+    async function cleanupTestDatabases(this: void, integration: MongoDBIntegrationTest) {
+        const mongo = integration.mongoClient();
+        const { databases } = await mongo.db().admin().listDatabases();
+        const droppable = databases.filter((database) => !systemDbNames.includes(database.name));
+        await Promise.all(droppable.map((database) => mongo.db(database.name).dropDatabase()));
+    }
+
+    return { populateTestData, cleanupTestDatabases };
+}
diff --git a/tests/unit/accuracy-scorer.test.ts b/tests/unit/accuracy-scorer.test.ts
new file mode 100644
index 00000000..60a389d7
--- /dev/null
+++ b/tests/unit/accuracy-scorer.test.ts
@@ -0,0 +1,199 @@
+import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracy-scorer.js";
+import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+describe("calculateToolCallingAccuracy", () => {
+    describe("edge cases", () => {
+        it("should return 1 when both expected and actual are empty", () => {
+            const result = calculateToolCallingAccuracy([], []);
+            expect(result).toBe(1); // nothing expected and nothing called counts as a perfect score
+        });
+
+        it("should return 0.75 when expected is empty but actual has tool calls", () => {
+            const actualToolCalls: LLMToolCall[] = [{ toolCallId: "1", toolName: "find", parameters: { db: "test" } }];
+            const result = calculateToolCallingAccuracy([], actualToolCalls);
+            expect(result).toBe(0.75); // an unexpected call lowers the score to 0.75 instead of zeroing it
+        });
+
+        it("should return 0 when expected has tool calls but actual is empty", () => {
+            const expectedToolCalls: ExpectedToolCall[] = [{ toolName: "find", parameters: { db: "test" } }];
+            const result = calculateToolCallingAccuracy(expectedToolCalls, []);
+            expect(result).toBe(0); // the expected call was never made
+        });
+    });
+
+    describe("perfect matches", () => {
+        it("should return 1 for exact match with nested parameters", () => {
+            const expected: ExpectedToolCall[] = [
+                {
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } },
+                },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } },
+                },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(1); // tool name and nested filter are identical on both sides
+        });
+
+        it("should return 1 for exact match with multiple diverse tool calls", () => {
+            const expected: ExpectedToolCall[] = [
+                { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } },
+                {
+                    toolName: "aggregate",
+                    parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] },
+                },
+                { toolName: "count", parameters: { db: "test", collection: "products" } },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active" } },
+                },
+                {
+                    toolCallId: "2",
+                    toolName: "aggregate",
+                    parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] },
+                },
+                { toolCallId: "3", toolName: "count", parameters: { db: "test", collection: "products" } },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(1); // every expected call has an identical actual call and there are no extras
+        });
+    });
+
+    describe("additional parameters", () => {
+        it("should return 0.75 when tool call has additional nested parameters", () => {
+            const expected: ExpectedToolCall[] = [
+                { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: {
+                        db: "test",
+                        collection: "users",
+                        filter: { status: "active", age: { $gte: 18 } },
+                        limit: 10,
+                    },
+                },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(0.75); // extra keys (filter.age, limit) on top of the expected ones cost 0.25
+        });
+    });
+
+    describe("missing or incorrect parameters", () => {
+        it("should return 0 when tool call has missing nested parameters", () => {
+            const expected: ExpectedToolCall[] = [
+                {
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } },
+                },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active" } },
+                },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(0); // the expected filter.age is absent from the actual call
+        });
+
+        it("should return 0 when aggregate tool call has incorrect pipeline", () => {
+            const expected: ExpectedToolCall[] = [
+                {
+                    toolName: "aggregate",
+                    parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] },
+                },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "aggregate",
+                    parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $lt: 50 } } }] },
+                },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(0); // the pipeline differs: $lt: 50 was sent where $gt: 100 was expected
+        });
+    });
+
+    describe("additional tool calls", () => {
+        it("should cap accuracy at 0.75 when LLM calls extra tools", () => {
+            const expected: ExpectedToolCall[] = [
+                { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active" } },
+                },
+                { toolCallId: "2", toolName: "count", parameters: { db: "test", collection: "orders" } },
+                {
+                    toolCallId: "3",
+                    toolName: "aggregate",
+                    parameters: {
+                        db: "test",
+                        collection: "products",
+                        pipeline: [{ $group: { _id: "$category", total: { $sum: 1 } } }],
+                    },
+                },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(0.75); // the find call matches exactly; the count and aggregate calls are extras
+        });
+
+        it("should cap accuracy at 0.75 when LLM calls same tool multiple times with variations", () => {
+            const expected: ExpectedToolCall[] = [
+                { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active" } },
+                },
+                {
+                    toolCallId: "2",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } },
+                },
+                { toolCallId: "3", toolName: "find", parameters: { db: "test", collection: "users", limit: 10 } },
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(0.75); // call "1" matches exactly; calls "2" and "3" are extra find calls
+        });
+    });
+
+    describe("missing tool calls", () => {
+        it("should return 0 if any expected tool call was not called", () => {
+            const expected: ExpectedToolCall[] = [
+                { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } },
+                {
+                    toolName: "aggregate",
+                    parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] },
+                },
+            ];
+            const actual: LLMToolCall[] = [
+                {
+                    toolCallId: "1",
+                    toolName: "find",
+                    parameters: { db: "test", collection: "users", filter: { status: "active" } },
+                },
+                // Missing the aggregate tool call
+            ];
+            const result = calculateToolCallingAccuracy(expected, actual);
+            expect(result).toBe(0); // One expected tool call was not called
+        });
+    });
+});