From 8439ed950e1d8f2c7e3b45891d174fcd4c7efd6d Mon Sep 17 00:00:00 2001 From: cdxker Date: Thu, 12 Sep 2024 14:41:55 -0700 Subject: [PATCH 1/7] docs: baseline for new pages on TVF / TVI / named to be determined --- guides/using-trieve-vector-inference.mdx | 105 ++++++++++++ mint.json | 26 ++- vector-inference/aws-installation.mdx | 177 +++++++++++++++++++++ vector-inference/dense.mdx | 45 ++++++ vector-inference/embed.mdx | 113 +++++++++++++ vector-inference/embed_all.mdx | 114 +++++++++++++ vector-inference/embed_sparse.mdx | 127 +++++++++++++++ vector-inference/introduction.mdx | 29 ++++ vector-inference/openai.mdx | 3 + vector-inference/openai_compat.mdx | 134 ++++++++++++++++ vector-inference/rerank.mdx | 42 +++++ vector-inference/reranker.mdx | 140 ++++++++++++++++ vector-inference/splade.mdx | 47 ++++++ vector-inference/working-with-reranker.mdx | 0 14 files changed, 1099 insertions(+), 3 deletions(-) create mode 100644 guides/using-trieve-vector-inference.mdx create mode 100644 vector-inference/aws-installation.mdx create mode 100644 vector-inference/dense.mdx create mode 100644 vector-inference/embed.mdx create mode 100644 vector-inference/embed_all.mdx create mode 100644 vector-inference/embed_sparse.mdx create mode 100644 vector-inference/introduction.mdx create mode 100644 vector-inference/openai.mdx create mode 100644 vector-inference/openai_compat.mdx create mode 100644 vector-inference/rerank.mdx create mode 100644 vector-inference/reranker.mdx create mode 100644 vector-inference/splade.mdx create mode 100644 vector-inference/working-with-reranker.mdx diff --git a/guides/using-trieve-vector-inference.mdx b/guides/using-trieve-vector-inference.mdx new file mode 100644 index 0000000..85a3acc --- /dev/null +++ b/guides/using-trieve-vector-inference.mdx @@ -0,0 +1,105 @@ +--- +title: 'Install Trieve vector inference' +description: 'Install Trieve Vector Inference' +icon: 'files' +--- + +## Installation Requirements + +- `eksctl` >= 0.171 ([eksctl installation guide](https://eksctl.io/installation)) +- `aws` >= 2.15 ([aws installation guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) +- `kubectl` >= 1.28 ([kubectl installation guide](https://kubernetes.io/docs/tasks/tools/#kubectl)) +- `helm` >= 3.14 ([helm installation guide](https://helm.sh/docs/intro/install/#helm)) + +You'll also need a license to run Trieve Vector Inference + +### Getting your license + +(contact us here) + +## Check AWS quota + +Ensure you have quotas for +1) At least **4 vCPUs** for On-Demand G and VT instances in the region of choice. + +Check quota for *us-east-2* [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +- At least **1 load-balancer per each model you want. + +Check quota for *us-east-2* [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +## Deploying the Cluster + +### Setting up environment variables + +Create eks cluster and install needed plugins + +Your AWS Account ID: +```sh +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" +``` + +Your AWS REGION: +```sh +export AWS_REGION=us-east-2 +``` + +Your Kubernetes cluster name: + +```sh +export CLUSTER_NAME=trieve-gpu +``` + +Your machine types, we recommend `g4dn.xlarge`, as it is the cheapest on AWS. A single small node is needed for extra utility. 
+ +```sh +export CPU_INSTANCE_TYPE=t3.small +export GPU_INSTANCE_TYPE=g4dn.xlarge +export GPU_COUNT=1 +``` + +### Create your cluster + +```sh +curl ./create_cluster.sh | sh +``` + +This will take around 25 minutes to complete + +## Install Trieve Vector Inference + +### Specify your embedding models + +Modify `embedding_models.yaml` for the models that you want to use + +### Install the helm chart + +```sh +helm upgrade -i vector-inference oci://registry-1.docker.io/trieve/embeddings-helm -f embedding_models.yaml +``` + +### Get your model endpoints + +```sh +kubectl get ingress +``` + +![](./assets/ingress.png) + +## Using Trieve Vector Inference + +```sh +curl -X POST -H "Content-Type: application/json" -d '{"inputs": "cancer" ,"model": "en"} +``` + +## Optional: Delete the cluster + +```sh +cluster_name=trieve-gpu +region=us-east-2 + +helm uninstall vector-release +helm uninstall nvdp -n kube-system +helm uninstall aws-load-balancer-controller -n kube-system +eksctl delete cluster --region=${REGION} --name=${CLUSTER_NAME} +``` diff --git a/mint.json b/mint.json index 740ad85..7f9ddd0 100644 --- a/mint.json +++ b/mint.json @@ -35,6 +35,10 @@ { "name": "API Reference", "url": "api-reference" + }, + { + "name": "Vector Inference", + "url": "vector-inference" } ], "anchors": [ @@ -66,7 +70,8 @@ "getting-started/introduction", "getting-started/quickstart", "getting-started/trieve-primitives", - "getting-started/screenshots" + "getting-started/screenshots", + "vector-inference/introduction" ] }, { @@ -75,7 +80,8 @@ "self-hosting/docker-compose", "self-hosting/local-kube", "self-hosting/aws", - "self-hosting/gcp" + "self-hosting/gcp", + "vector-inference/aws-installation" ] }, { @@ -85,7 +91,21 @@ "guides/uploading-files", "guides/searching-with-trieve", "guides/recommending-with-trieve", - "guides/RAG-with-trieve" + "guides/RAG-with-trieve", + "vector-inference/rerank", + "vector-inference/splade", + "vector-inference/dense", + "vector-inference/openai" + ] + }, + { + "group": "API Reference", + "pages": [ + "vector-inference/embed", + "vector-inference/embed_all", + "vector-inference/embed_sparse", + "vector-inference/reranker", + "vector-inference/openai_compat" ] }, { diff --git a/vector-inference/aws-installation.mdx b/vector-inference/aws-installation.mdx new file mode 100644 index 0000000..0d539d8 --- /dev/null +++ b/vector-inference/aws-installation.mdx @@ -0,0 +1,177 @@ +--- +title: 'AWS Installation' +description: 'Install Trieve Vector Inference' +icon: 'aws' +--- + +## Installation Requirements + +- `eksctl` >= 0.171 ([eksctl installation guide](https://eksctl.io/installation)) +- `aws` >= 2.15 ([aws installation guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) +- `kubectl` >= 1.28 ([kubectl installation guide](https://kubernetes.io/docs/tasks/tools/#kubectl)) +- `helm` >= 3.14 ([helm installation guide](https://helm.sh/docs/intro/install/#helm)) + +You'll also need a license to run Trieve Vector Inference + +### Getting your license + +(contact us here) + +## Check AWS quota + + + Ensure you have quotas for Both GPU's and Load Balancers. + + +1) At least **4 vCPUs** for On-Demand G and VT instances in the region of choice. + +Check quota for [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +2) You will need **1 load-balancer** per each model you want. 
+ +Check quotas for [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +## Deploying the Cluster + +### Setting up environment variables + +Create eks cluster and install needed plugins + +Your AWS Account ID: +```sh +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" +``` + +Your AWS REGION: +```sh +export AWS_REGION=us-east-2 +``` + +Your Kubernetes cluster name: + +```sh +export CLUSTER_NAME=trieve-gpu +``` + +Your machine types, we recommend `g4dn.xlarge`, as it is the cheapest on AWS. A single small node is needed for extra utility. + +```sh +export CPU_INSTANCE_TYPE=t3.small +export GPU_INSTANCE_TYPE=g4dn.xlarge +export GPU_COUNT=1 +``` + +**To use our reccomended defaults** + +```sh +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" +export AWS_REGION=us-east-2 +export CLUSTER_NAME=trieve-gpu +export CPU_INSTANCE_TYPE=t3.small +export GPU_INSTANCE_TYPE=g4dn.xlarge +export GPU_COUNT=1 +``` + +### Create your cluster + +Download the bootstrap script +```sh +wget cdn.trieve.ai/bootstrap-eks.sh +``` + +Run the bootstrap script with bash + +```sh +bash bootstrap-eks.sh +``` + +This will take around 25 minutes to complete + +## Install Trieve Vector Inference + +### Configure `embedding_models.yaml` + +Frist download the example configiguration file + +```sh +wget https://cdn.trieve.ai/embedding_models.yaml +``` + +Now you can modify your `embedding_models.yaml`, this defines all the models that you will want to use + +```yaml embedding_models.yaml +accessKey: "" + +models: + bgeM3: + replicas: 2 + revision: main + modelName: BAAI/bge-m3 # The end of the URL https://huggingface.co/BAAI/bge-m3 + hfToken: "" # If you have a private hugging face repo + spladeDoc: + replicas: 2 + modelName: naver/efficient-splade-VI-BT-large-doc # The end of the URL https://huggingface.co/naver/efficient-splade-VI-BT-large-doc + isSplade: true + spladeQuery: + replicas: 2 + modelName: naver/efficient-splade-VI-BT-large-doc # The end of the URL https://huggingface.co/naver/efficient-splade-VI-BT-large-doc + isSplade: true + bge-reranker: + replicas: 2 + modelName: BAAI/bge-reranker-large + isSplade: false +``` + +### Install the helm chart + +```sh +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + +### Get your model endpoints + +```sh +kubectl get ingress +``` + +The output looks something like this: + +``` +NAME CLASS HOSTS ADDRESS PORTS AGE +vector-inference-embedding-bge-reranker-ingress alb * k8s-default-vectorin-18b7ade77a-2040086997.us-east-2.elb.amazonaws.com 80 73s +vector-inference-embedding-bgem3-ingress alb * k8s-default-vectorin-25e84e25f0-1362792264.us-east-2.elb.amazonaws.com 80 73s +vector-inference-embedding-spladedoc-ingress alb * k8s-default-vectorin-8af81ad2bd-192706382.us-east-2.elb.amazonaws.com 80 72s +vector-inference-embedding-spladequery-ingress alb * k8s-default-vectorin-10404abaee-1617952667.us-east-2.elb.amazonaws.com 80 3m20s +``` + +## Using Trieve Vector Inference + +Each `ingress` point will be using their own Application Load Balancer within AWS. 
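+
+If you want to capture an endpoint in a script, one option is a `kubectl` JSONPath query (the ingress name below comes from the example output above; substitute the one for your model):
+
+```sh
+export ENDPOINT=$(kubectl get ingress vector-inference-embedding-bgem3-ingress \
+  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
+echo "$ENDPOINT"
+```
+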
The `Address` provided is the model's endpoint that you can make [dense embeddings](/vector-inference/embed), [sparse embeddings](/vector-inference/embed_sparse), or [reranker calls](/vector-inference/reranker) based on the models you chose + +Check out the guides for more information on configureation + + + + How to setup a dedicated instance for the sparse splade embedding model + + + How to use private or gated hugging face models. Or any models that you want + + + Trieve Vector Inference has openai compatible routes + + + +## Optional: Delete the cluster + +```sh +cluster_name=trieve-gpu +region=us-east-2 + +helm uninstall vector-release +helm uninstall nvdp -n kube-system +helm uninstall aws-load-balancer-controller -n kube-system +eksctl delete cluster --region=${REGION} --name=${CLUSTER_NAME} +``` diff --git a/vector-inference/dense.mdx b/vector-inference/dense.mdx new file mode 100644 index 0000000..b476c3d --- /dev/null +++ b/vector-inference/dense.mdx @@ -0,0 +1,45 @@ +--- +title: 'Using Custom Models' +--- + +## Custom or fine tuned models in Trieve Vector Inference + +The [open source text models](https://huggingface.co/spaces/mteb/leaderboard) on hugging face may not be what you always want, + + + + +To use a private or custom model with Trieve Vector Inference, you will need to update your `embedding_models.yaml` file. + +If the model is a private or gated hugging face model, you will need to include your huggingface api token + +```yaml embedding_models.yaml +... +models: + ... + my-custom-model: + replicas: 1 + revision: main + modelName: trieve/private-model-example + hfToken: "hf_**********************************" +... +``` + + + +Update TVI to include your models + +```bash +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + + + +```sh +kubectl get ing +``` + + + diff --git a/vector-inference/embed.mdx b/vector-inference/embed.mdx new file mode 100644 index 0000000..07248bc --- /dev/null +++ b/vector-inference/embed.mdx @@ -0,0 +1,113 @@ +--- +title: 'Create Embedding' +sidebarTitle: 'POST /embed' +description: 'Get Embeddings. Returns a 424 status code if the model is not an embedding model' +--- + +Generating an embedding from a dense embedding model + + + +```json RAW Json +{ + "inputs": "The model input", + "prompt_name": null, + "normalize": true, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///embed +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/embed", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/embed", json={ + "inputs": "test single input" +}); +``` + + + + +```json 200 Embeddings +[ + [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + [ + 0.04496114, + -0.039057795, + -0.022400795, + ... + ] +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + Inputs that need to be embedded + + + + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. 
+ +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + diff --git a/vector-inference/embed_all.mdx b/vector-inference/embed_all.mdx new file mode 100644 index 0000000..5e37c47 --- /dev/null +++ b/vector-inference/embed_all.mdx @@ -0,0 +1,114 @@ +--- +title: 'Create Embedding' +sidebarTitle: 'POST /embed_all' +description: 'Get Embeddings. Returns a 424 status code if the model is not an embedding model' +--- + +Generating an embedding from a dense embedding model + + + +```json RAW Json +{ + "inputs": "The model input", + "prompt_name": null, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///embed_all +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/embed_all", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/embed_all", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Embeddings +[ + [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + [ + 0.04496114, + -0.039057795, + -0.022400795, + ... + ] +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. + +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/embed_sparse.mdx b/vector-inference/embed_sparse.mdx new file mode 100644 index 0000000..44a7c98 --- /dev/null +++ b/vector-inference/embed_sparse.mdx @@ -0,0 +1,127 @@ +--- +title: 'Create Sparse Embedding' +sidebarTitle: 'POST /embed_sparse' +description: 'Get Sparse Embeddings. Returns a 424 status code if the model is not a splade embedding model' +--- + +Generating an embedding from a sparse embedding model. 
+The main one that we support right now are the splade models + + + +```json RAW Json +{ + "inputs": "The model input", + "prompt_name": null, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///embed_sparse +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/embed_sparse", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/embed_sparse", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Embeddings +[ + // Embedding 1 + [ + { + "index": 1012, + "value": 0.9970703 + }, + { + "index": 4456, + "value": 2.7832031 + } + ], + // Embedding 2 + [ + { + "index": 990, + "value": 2.783203 + }, + { + "index": 3021, + "value": 10.9970703 + }, + ... + ], + ... +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. + +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx new file mode 100644 index 0000000..c450ed3 --- /dev/null +++ b/vector-inference/introduction.mdx @@ -0,0 +1,29 @@ +--- +title: Introduction +description: Trieve Vector Inference is an on-prem solution for fast vector infernece +--- + +## Quick Start + + + + Adding Trieve Vector Inference into your AWS account + + + + Using the `/embed` route + + + + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference + + + + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference + + + + +## Performance Difference + +Refer to our benchmarks diff --git a/vector-inference/openai.mdx b/vector-inference/openai.mdx new file mode 100644 index 0000000..f22e933 --- /dev/null +++ b/vector-inference/openai.mdx @@ -0,0 +1,3 @@ +--- +title: "OpenAI compatible interface" +--- diff --git a/vector-inference/openai_compat.mdx b/vector-inference/openai_compat.mdx new file mode 100644 index 0000000..a567dd5 --- /dev/null +++ b/vector-inference/openai_compat.mdx @@ -0,0 +1,134 @@ +--- +title: 'OpenAI compatible embeddings route' +sidebarTitle: 'POST /v1/embeddings' +description: 'OpenAI compatible route. 
Returns a 424 status code if the model is not an embedding model' +--- + +Generating an embedding from a dense embedding model + + + +```json Raw JSON +{ + "encoding_format": "float", + "input": "string", + "model": "null", + "user": "null" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///v1/embeddings +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/v1/embeddings", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/v1/embeddings", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Embeddings +{ + "data": [ + { + "embedding": [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + "index": 0, + "object": "embedding" + }, + { + "embedding": [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + "index": 1, + "object": "embedding" + }, + ... + ], + "model": "thenlper/gte-base", + "object": "list", + "usage": { + "prompt_tokens": 512, + "total_tokens": 512 + } +} +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. + +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/rerank.mdx b/vector-inference/rerank.mdx new file mode 100644 index 0000000..14ed536 --- /dev/null +++ b/vector-inference/rerank.mdx @@ -0,0 +1,42 @@ +--- +title: "Working with Reranker" +--- + +## What is a Reranker / CrossEncoder? + +`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion + +## Using with Trieve Vector Inference + + + +To use a reranker model with Trieve Vector Inference, you will need to update your embedding_models.yaml file + +```yaml embedding_models.yaml +... +models: + ... + my-reranker-model: + replicas: 1 + revision: main + modelName: BAAI/bge-reranker-large +... +``` + + + +Update TVF to include your models + +```bash +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + + + +```sh +kubectl get ing +``` + + diff --git a/vector-inference/reranker.mdx b/vector-inference/reranker.mdx new file mode 100644 index 0000000..1e1e6a2 --- /dev/null +++ b/vector-inference/reranker.mdx @@ -0,0 +1,140 @@ +--- +title: 'Get ranks' +sidebarTitle: 'POST /rerank' +description: 'Runs Reranker. Returns a 424 status code if the model is not a Reranker model' +--- + + + +```json Raw Json +{ + "query": "What are some good electric cars", + "texts": [ + "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ...", + "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." 
+ ], + "raw_scores": false, + "return_text": false, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "query": "What are some good electric cars", + "texts": [ + "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ...", + "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." + ], + "raw_scores": false, + "return_text": false, + "truncate": false, + "truncation_direction": "right" + }' \ + --url http:///rerank +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/rerank", json={ + "query": "What are some good electric cars", + "texts": [ + "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ...", + "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." + ], + "raw_scores": False, + "return_text": False, + "truncate": False, + "truncation_direction": "right" +}); + +## or + +requests.post(f"{endpoint}/rerank", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Ranks +[ + { + "index":1, + "score":0.15253653, + // if return_text = true + "text": "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." + }, + { + "index":0, + "score":0.00498227 + // if return_text = true + "text": "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ..." + } +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + + Inputs that need to be embedded + + + + Output the raw reranker score or the normalized score between 0-1. + When `false`, score is between 0 and 1, otherwise range is indeterminate + + + + Return the text with along with each rank + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/splade.mdx b/vector-inference/splade.mdx new file mode 100644 index 0000000..4a74b97 --- /dev/null +++ b/vector-inference/splade.mdx @@ -0,0 +1,47 @@ +--- +title: "Working with Splade v2" +--- + +## What is splade? + +`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion + +## Using with Trieve Vector Inference + + + +To use splade with Trieve Vector Inference, you will need to adapt both the `doc` and `query` models + +The splade `document` model is the model you use to encode files, where the `query` model is the one to encode the query that you will be searching with + +```yaml embedding_models.yaml +models: + # ... + spladeDoc: + replicas: 1 + modelName: naver/efficient-splade-VI-BT-large-doc + isSplade: true + spladeQuery: + replicas: 1 + modelName: naver/efficient-splade-VI-BT-large-query + isSplade: true + # ... 
+``` + + + +Update TVF to include your models + +```bash +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + + + +```sh +kubectl get ing +``` + + diff --git a/vector-inference/working-with-reranker.mdx b/vector-inference/working-with-reranker.mdx new file mode 100644 index 0000000..e69de29 From 0cf6ce6a0dc51dffd46d5c6e20fab45a7fd620b1 Mon Sep 17 00:00:00 2001 From: cdxker Date: Mon, 16 Sep 2024 10:52:23 -0700 Subject: [PATCH 2/7] fix: spell checks and formatting --- vector-inference/aws-installation.mdx | 23 +++++++++++++++------- vector-inference/embed_sparse.mdx | 4 ++-- vector-inference/openai_compat.mdx | 6 +++--- vector-inference/working-with-reranker.mdx | 0 4 files changed, 21 insertions(+), 12 deletions(-) delete mode 100644 vector-inference/working-with-reranker.mdx diff --git a/vector-inference/aws-installation.mdx b/vector-inference/aws-installation.mdx index 0d539d8..b07d6e2 100644 --- a/vector-inference/aws-installation.mdx +++ b/vector-inference/aws-installation.mdx @@ -61,7 +61,13 @@ export GPU_INSTANCE_TYPE=g4dn.xlarge export GPU_COUNT=1 ``` -**To use our reccomended defaults** +Disable AWS CLI pagination (optional): + +```sh +export AWS_PAGER="" +``` + +**To use our recommended defaults** ```sh export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" @@ -70,16 +76,17 @@ export CLUSTER_NAME=trieve-gpu export CPU_INSTANCE_TYPE=t3.small export GPU_INSTANCE_TYPE=g4dn.xlarge export GPU_COUNT=1 +export AWS_PAGER="" ``` ### Create your cluster -Download the bootstrap script +Download the `bootstrap-eks.sh` script ```sh wget cdn.trieve.ai/bootstrap-eks.sh ``` -Run the bootstrap script with bash +Run `bootstrap-eks.sh` with bash ```sh bash bootstrap-eks.sh @@ -91,7 +98,7 @@ This will take around 25 minutes to complete ### Configure `embedding_models.yaml` -Frist download the example configiguration file +First download the example configuration file ```sh wget https://cdn.trieve.ai/embedding_models.yaml @@ -150,7 +157,7 @@ vector-inference-embedding-spladequery-ingress alb * k8s-default-ve Each `ingress` point will be using their own Application Load Balancer within AWS. The `Address` provided is the model's endpoint that you can make [dense embeddings](/vector-inference/embed), [sparse embeddings](/vector-inference/embed_sparse), or [reranker calls](/vector-inference/reranker) based on the models you chose -Check out the guides for more information on configureation +Check out the guides for more information on configuration @@ -167,8 +174,10 @@ Check out the guides for more information on configureation ## Optional: Delete the cluster ```sh -cluster_name=trieve-gpu -region=us-east-2 +CLUSTER_NAME=trieve-gpu +REGION=us-east-2 + +aws eks update-kubeconfig --region ${REGION} --name ${CLUSTER_NAME} helm uninstall vector-release helm uninstall nvdp -n kube-system diff --git a/vector-inference/embed_sparse.mdx b/vector-inference/embed_sparse.mdx index 44a7c98..2ceee6a 100644 --- a/vector-inference/embed_sparse.mdx +++ b/vector-inference/embed_sparse.mdx @@ -1,11 +1,11 @@ --- title: 'Create Sparse Embedding' sidebarTitle: 'POST /embed_sparse' -description: 'Get Sparse Embeddings. Returns a 424 status code if the model is not a splade embedding model' +description: 'Get Sparse Embeddings. Returns a 424 status code if the model is not a Splade embedding model' --- Generating an embedding from a sparse embedding model. 
-The main one that we support right now are the splade models +The main one that we support right now are the Splade models diff --git a/vector-inference/openai_compat.mdx b/vector-inference/openai_compat.mdx index a567dd5..00e0430 100644 --- a/vector-inference/openai_compat.mdx +++ b/vector-inference/openai_compat.mdx @@ -20,7 +20,7 @@ Generating an embedding from a dense embedding model ```sh curl curl -X POST \ -H "Content-Type: application/json"\ - -d '{"inputs": "test input"}' \ + -d '{"input": "test input"}' \ --url http:///v1/embeddings ``` @@ -30,13 +30,13 @@ import requests endpoint = "" requests.post(f"{endpoint}/v1/embeddings", json={ - "inputs": ["test input", "test input 2"] + "input": ["test input", "test input 2"] }); ## or requests.post(f"{endpoint}/v1/embeddings", json={ - "inputs": "test single input" + "input": "test single input" }); diff --git a/vector-inference/working-with-reranker.mdx b/vector-inference/working-with-reranker.mdx deleted file mode 100644 index e69de29..0000000 From d173b3b9a6199d15ae231a4271cda01d52d02d81 Mon Sep 17 00:00:00 2001 From: cdxker Date: Mon, 16 Sep 2024 10:52:38 -0700 Subject: [PATCH 3/7] docs: openai copmatiblity guide --- vector-inference/openai.mdx | 38 ++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/vector-inference/openai.mdx b/vector-inference/openai.mdx index f22e933..e2a0c3c 100644 --- a/vector-inference/openai.mdx +++ b/vector-inference/openai.mdx @@ -1,3 +1,39 @@ --- -title: "OpenAI compatible interface" +title: "Using OpenAI SDK" --- + +Trieve Vector Inference is compatible with the OpenAI api. This means you're able to just replace the endpoint, without changing any pre-existing code. +Here's an example with the `openai` python sdk + +First install the dependencies + +```sh +pip install openai +pip install requests +pip install python-dotenv +``` + +```python openai_compatibility.py +import openai +import time +import requests +import os +from dotenv import load_dotenv + +load_dotenv() + +endpoint = "http://" + +openai.base_url = endpoint + +client = openai.OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url=endpoint +) + +embedding = client.embeddings.create( + input="This is some example input", + model="BAAI/bge-m3" +) +``` From 217c570f6a148bf3f633d22c6881f7cb252e73e8 Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 17 Sep 2024 00:10:45 -0700 Subject: [PATCH 4/7] docs: added benchmarks --- vector-inference/introduction.mdx | 57 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx index c450ed3..b205c7e 100644 --- a/vector-inference/introduction.mdx +++ b/vector-inference/introduction.mdx @@ -1,8 +1,58 @@ --- title: Introduction -description: Trieve Vector Inference is an on-prem solution for fast vector infernece +description: Trieve Vector Inference is an on-prem solution for fast vector inference --- +## Inspiration + +SaSS offerings for text embeddings have 2 major issues: +1) They have higher latency, due to batch processing. +2) They have heavy rate limits. + +Trieve Vector Inference was created so you could Host Dedicated embedding servers in your own cloud. + +## Performance Difference + +Benchmarks ran using [wrk2](https://github.com/giltene/wrk2) over 30 seconds on 12 threads and 40 active connections. + +Machine used to test was on `m5.large` in `us-west-1`. 
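+
+For reference, a run against a single TVI endpoint at a fixed request rate looks roughly like the following `wrk2` invocation (the endpoint, target rate, and `post.lua` script are illustrative; `post.lua` is assumed to set the JSON POST body and `Content-Type` header):
+
+```sh
+# wrk2 installs a binary named `wrk`; -R pins the request rate (here 100 rps)
+# post.lua (not shown) is assumed to set the body, e.g. '{"inputs": "benchmark input"}'
+wrk -t12 -c40 -d30s -R100 -s post.lua "http://$ENDPOINT/embed"
+```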
+ + + + +| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|-------------|----------|----------| +| P50 Latency | 193.15 ms | 179.33 ms | 19.06 ms | 14.69 ms | 21.36 ms | +| P90 Latency | 261.25 ms | 271.87 ms | 23.09 ms | 16.90 ms | 29.81 ms | +| P99 Latency | 621.05 ms | 402.43 ms | 24.27 ms | 18.80 ms | 30.29 ms | +| Requests Made | 324 | 324 | 324 | 324 | 324 | +| Requests Failed | 0 | 0 | 0 | 0 | 0 | + + + +| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|-------------|----------|----------| +| P50 Latency | 180.74 ms | 182.62 ms | 16.48 ms | 14.35 ms | 23.22 ms | +| P90 Latency | 222.34 ms | 262.65 ms | 20.70 ms | 16.15 ms | 29.71 ms | +| P99 Latency | 1.11 sec | 363.01 ms | 22.82 ms | 19.82 ms | 31.07 ms | +| Requests Made | 2,991 | 2,991 | 3,015 | 3,024 | 3,024 | +| Requests Failed | 0 | 2,986 | 0 | 0 | 0 | + + + +| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|-------------|-----------|----------| +| P50 Latency | 15.70 sec | 15.82 sec | 24.40 ms | 14.86 ms | 23.74 ms | +| P90 Latency | 22.01 sec | 21.91 sec | 25.14 ms | 17.81 ms | 31.74 ms | +| P99 Latency | 23.59 sec | 23.12 sec | 27.61 ms | 19.52 ms | 34.11 ms | +| Requests Made | 6,234 | 6,771 | 30,002 | 30,002 | 30,001 | +| Requests Failed | 0 | 6,711 | 0 | 0 | 0 | + + + + +\* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM) + ## Quick Start @@ -22,8 +72,3 @@ description: Trieve Vector Inference is an on-prem solution for fast vector infe Check out the API Reference to see all of the available endpoints for Trieve Vector Inference - - -## Performance Difference - -Refer to our benchmarks From 6064c544d803ab1cb6970583de96d0cc56217b77 Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 17 Sep 2024 13:46:50 -0700 Subject: [PATCH 5/7] docs: added trouble shooting guide and pricing --- vector-inference/pricing.mdx | 64 ++++++++++++++++++++++++++++ vector-inference/troubleshooting.mdx | 51 ++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 vector-inference/pricing.mdx create mode 100644 vector-inference/troubleshooting.mdx diff --git a/vector-inference/pricing.mdx b/vector-inference/pricing.mdx new file mode 100644 index 0000000..8966037 --- /dev/null +++ b/vector-inference/pricing.mdx @@ -0,0 +1,64 @@ +--- +title: Pricing +description: The pricing design Trieve Vector Inference +mode: wide +icon: money-bill +--- + +Trieve Vector Inference is meant to be an on-prem solution a license is needed for use. + +To obtain a license for Trieve Vector Inference contact us: + +- Email us at humans@trieve.ai +- [book a meeting](https://cal.com/nick.k/meet) +- Call us @ 628-222-4090 + + + + +
+
+

+<CardGroup cols={3}>
+  <Card title="$0* per month">
+    - Hosting License
+    - Unlimited Clusters
+  </Card>
+  <Card title="$500 per month">
+    - Hosting License
+    - Unlimited Clusters
+    - Dedicated Slack Support
+  </Card>
+  <Card title="$1000+ per month">
+    - Hosting License
+    - Unlimited Clusters
+    - Dedicated Slack Support
+    - 99.9% SLA
+    - Managed and hosted by Trieve
+  </Card>
+</CardGroup>
+
+
+ +\* Free for < 10 employees or Pre-seed diff --git a/vector-inference/troubleshooting.mdx b/vector-inference/troubleshooting.mdx new file mode 100644 index 0000000..355b91b --- /dev/null +++ b/vector-inference/troubleshooting.mdx @@ -0,0 +1,51 @@ +--- +title: Troubleshooting +icon: 'triangle-exclamation' +description: 'Common issues with self hosting' +--- + +There are a lot of moving parts in `eksctl`. Here’s a list of common issues we’ve seen customers run into: + + + + + This error happens when deleting the cluster and some pods in `kube-system` refuse to stop. + To fix this run the following command and the deletion process should be able to proceed. + + ```sh + kubectl get pods -n kube-system -o NAME | xargs kubectl -n kube-system delete + ``` + + + + This happens when the cluster doesn't properly delete load balancers, to fix this + + + + Run this to get the available load balancers + ```sh + kubectl get ingress + ``` + + + The output should look like this + ``` + NAME CLASS HOSTS ADDRESS PORTS AGE + vector-inference-embedding-bgem3-ingress alb * k8s-default-vectorin-25e84e25f0-1362792264.us-east-2.elb.amazonaws.com 80 3d19h + vector-inference-embedding-nomic-ingress alb * k8s-default-vectorin-eb664ce6e9-238019709.us-east-2.elb.amazonaws.com 80 2d20h + vector-inference-embedding-spladedoc-ingress alb * k8s-default-vectorin-8af81ad2bd-192706382.us-east-2.elb.amazonaws.com 80 3d19h + ``` + + + + + Go to EC2 > LoadBalancers ([link](https://us-west-1.console.aws.amazon.com/ec2/home?region=us-west-1#LoadBalancers:v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false)) and delete the alb's that have the ingress point names + + + + + The delete script should be able to resume + + + + From 16746df3b819f1e021e44f026c6710b9391a69cb Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 17 Sep 2024 13:47:19 -0700 Subject: [PATCH 6/7] cleanup: cleaned up icons and extra descriptions --- mint.json | 6 ++- vector-inference/aws-installation.mdx | 9 +++- vector-inference/dense.mdx | 3 ++ vector-inference/embed.mdx | 2 +- vector-inference/embed_all.mdx | 2 +- vector-inference/embed_sparse.mdx | 2 +- vector-inference/introduction.mdx | 11 +++-- vector-inference/openai.mdx | 70 +++++++++++++++------------ vector-inference/openai_compat.mdx | 2 +- vector-inference/rerank.mdx | 15 +++++- vector-inference/reranker.mdx | 2 +- vector-inference/splade.mdx | 20 +++++++- 12 files changed, 94 insertions(+), 50 deletions(-) diff --git a/mint.json b/mint.json index 7f9ddd0..b8404d5 100644 --- a/mint.json +++ b/mint.json @@ -71,7 +71,8 @@ "getting-started/quickstart", "getting-started/trieve-primitives", "getting-started/screenshots", - "vector-inference/introduction" + "vector-inference/introduction", + "vector-inference/pricing" ] }, { @@ -81,7 +82,8 @@ "self-hosting/local-kube", "self-hosting/aws", "self-hosting/gcp", - "vector-inference/aws-installation" + "vector-inference/aws-installation", + "vector-inference/troubleshooting" ] }, { diff --git a/vector-inference/aws-installation.mdx b/vector-inference/aws-installation.mdx index b07d6e2..ed82233 100644 --- a/vector-inference/aws-installation.mdx +++ b/vector-inference/aws-installation.mdx @@ -1,6 +1,6 @@ --- title: 'AWS Installation' -description: 'Install Trieve Vector Inference' +description: 'Install Trieve Vector Inference in your own aws account' icon: 'aws' --- @@ -15,7 +15,12 @@ You'll also need a license to run Trieve Vector Inference ### Getting your license -(contact us here) +Contact us: +- Email us at humans@trieve.ai +- [book a 
meeting](https://cal.com/nick.k/meet) +- Call us @ 628-222-4090 + +Our pricing is [here](/vector-inference/pricing) ## Check AWS quota diff --git a/vector-inference/dense.mdx b/vector-inference/dense.mdx index b476c3d..00cce87 100644 --- a/vector-inference/dense.mdx +++ b/vector-inference/dense.mdx @@ -1,5 +1,8 @@ --- title: 'Using Custom Models' +icon: brackets-curly +description: How to use gated or private models hosted on huggingface +mode: wide --- ## Custom or fine tuned models in Trieve Vector Inference diff --git a/vector-inference/embed.mdx b/vector-inference/embed.mdx index 07248bc..5cb4153 100644 --- a/vector-inference/embed.mdx +++ b/vector-inference/embed.mdx @@ -22,7 +22,7 @@ Generating an embedding from a dense embedding model curl -X POST \ -H "Content-Type: application/json"\ -d '{"inputs": "test input"}' \ - --url http:///embed + --url "http://$ENDPOINT/embed" ``` ```py python diff --git a/vector-inference/embed_all.mdx b/vector-inference/embed_all.mdx index 5e37c47..2d46cc0 100644 --- a/vector-inference/embed_all.mdx +++ b/vector-inference/embed_all.mdx @@ -21,7 +21,7 @@ Generating an embedding from a dense embedding model curl -X POST \ -H "Content-Type: application/json"\ -d '{"inputs": "test input"}' \ - --url http:///embed_all + --url http://$ENDPOINT/embed_all ``` ```py python diff --git a/vector-inference/embed_sparse.mdx b/vector-inference/embed_sparse.mdx index 2ceee6a..59d18b9 100644 --- a/vector-inference/embed_sparse.mdx +++ b/vector-inference/embed_sparse.mdx @@ -22,7 +22,7 @@ The main one that we support right now are the Splade models curl -X POST \ -H "Content-Type: application/json"\ -d '{"inputs": "test input"}' \ - --url http:///embed_sparse + --url http://$ENDPOINT/embed_sparse ``` ```py python diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx index b205c7e..d308707 100644 --- a/vector-inference/introduction.mdx +++ b/vector-inference/introduction.mdx @@ -1,6 +1,7 @@ --- title: Introduction description: Trieve Vector Inference is an on-prem solution for fast vector inference +icon: rocket --- ## Inspiration @@ -51,24 +52,24 @@ Machine used to test was on `m5.large` in `us-west-1`. -\* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM) +\* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM for premium plan) -## Quick Start +## See more Adding Trieve Vector Inference into your AWS account - + Using the `/embed` route - + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference - + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference diff --git a/vector-inference/openai.mdx b/vector-inference/openai.mdx index e2a0c3c..778df13 100644 --- a/vector-inference/openai.mdx +++ b/vector-inference/openai.mdx @@ -1,39 +1,45 @@ --- title: "Using OpenAI SDK" +icon: microchip-ai +description: How to integrate TVI with existing openai compatible endpoints --- Trieve Vector Inference is compatible with the OpenAI api. This means you're able to just replace the endpoint, without changing any pre-existing code. 
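+
+Before wiring up the SDK, you can sanity check the route with a plain HTTP call (the endpoint and model name below are placeholders; see the [/v1/embeddings reference](/vector-inference/openai_compat) for details):
+
+```sh
+curl -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"input": "test input", "model": "BAAI/bge-m3"}' \
+  --url "http://$ENDPOINT/v1/embeddings"
+```
+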
Here's an example with the `openai` python sdk -First install the dependencies - -```sh -pip install openai -pip install requests -pip install python-dotenv -``` - -```python openai_compatibility.py -import openai -import time -import requests -import os -from dotenv import load_dotenv - -load_dotenv() - -endpoint = "http://" - -openai.base_url = endpoint - -client = openai.OpenAI( - # This is the default and can be omitted - api_key=os.environ.get("OPENAI_API_KEY"), - base_url=endpoint -) - -embedding = client.embeddings.create( - input="This is some example input", - model="BAAI/bge-m3" -) -``` + + + ```sh + pip install openai requests python-dotenv + ``` + + + + Replace `base_url` with your embeddding endpoint. + + ```python openai_compatibility.py + import openai + import time + import requests + import os + from dotenv import load_dotenv + + load_dotenv() + + endpoint = "http://" + + openai.base_url = endpoint + + client = openai.OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url=endpoint + ) + + embedding = client.embeddings.create( + input="This is some example input", + model="BAAI/bge-m3" + ) + ``` + + diff --git a/vector-inference/openai_compat.mdx b/vector-inference/openai_compat.mdx index 00e0430..76bc121 100644 --- a/vector-inference/openai_compat.mdx +++ b/vector-inference/openai_compat.mdx @@ -21,7 +21,7 @@ Generating an embedding from a dense embedding model curl -X POST \ -H "Content-Type: application/json"\ -d '{"input": "test input"}' \ - --url http:///v1/embeddings + --url http://$ENDPOINT/v1/embeddings ``` ```py python diff --git a/vector-inference/rerank.mdx b/vector-inference/rerank.mdx index 14ed536..fed542d 100644 --- a/vector-inference/rerank.mdx +++ b/vector-inference/rerank.mdx @@ -1,12 +1,14 @@ --- title: "Working with Reranker" +mode: wide +icon: arrow-up-arrow-down --- ## What is a Reranker / CrossEncoder? -`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion +A `Reranker` model provides a powerful semantic boost to the search quality of any keyword or vector search system without requiring any overhaul or replacement. -## Using with Trieve Vector Inference +## Using Rerankers with Trieve Vector Inference @@ -38,5 +40,14 @@ helm upgrade -i vector-inference \ ```sh kubectl get ing ``` + +``` +NAME CLASS HOSTS ADDRESS PORTS AGE +vector-inference-embedding-bge-reranker-ingress alb * k8s-default-vectorin-b09efe8cf6-890425945.us-west-1.elb.amazonaws.com 80 77m +``` + +The output looks like this + + diff --git a/vector-inference/reranker.mdx b/vector-inference/reranker.mdx index 1e1e6a2..023f723 100644 --- a/vector-inference/reranker.mdx +++ b/vector-inference/reranker.mdx @@ -34,7 +34,7 @@ curl -X POST \ "truncate": false, "truncation_direction": "right" }' \ - --url http:///rerank + --url http://$ENDPOINT/rerank ``` ```py python diff --git a/vector-inference/splade.mdx b/vector-inference/splade.mdx index 4a74b97..a28855d 100644 --- a/vector-inference/splade.mdx +++ b/vector-inference/splade.mdx @@ -1,12 +1,15 @@ --- title: "Working with Splade v2" +icon: magnifying-glass +description: Learn how to use splade with TVI. +mode: wide --- ## What is splade? -`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion +`Splade` is similar to other inverted index approaches like `bm25`. 
`Splade` includes neural term expansion, meaning that it is able to match on synonym's much better than traditional bm25 -## Using with Trieve Vector Inference +## Using Splade with Trieve Vector Inference @@ -44,4 +47,17 @@ helm upgrade -i vector-inference \ kubectl get ing ``` + + + ```sh + ENDPOINT="k8s-default-vectorin...elb.amazonaws.com" + + curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http://$ENDPOINT/embed_sparse + ``` + + For more information checkout the [API reference](/vector-inference/embed_sparse) for sparse vectors + From 9f40a28b80001a69ad761634492e84b71fff64c8 Mon Sep 17 00:00:00 2001 From: cdxker Date: Thu, 19 Sep 2024 23:19:02 -0700 Subject: [PATCH 7/7] docs: add SageMaker metrics with wrk2 benchmark --- vector-inference/introduction.mdx | 44 ++++++++++++++++--------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx index d308707..b0ddc66 100644 --- a/vector-inference/introduction.mdx +++ b/vector-inference/introduction.mdx @@ -21,39 +21,41 @@ Machine used to test was on `m5.large` in `us-west-1`. -| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | -|-------------|---------------|----------------|-------------|----------|----------| -| P50 Latency | 193.15 ms | 179.33 ms | 19.06 ms | 14.69 ms | 21.36 ms | -| P90 Latency | 261.25 ms | 271.87 ms | 23.09 ms | 16.90 ms | 29.81 ms | -| P99 Latency | 621.05 ms | 402.43 ms | 24.27 ms | 18.80 ms | 30.29 ms | -| Requests Made | 324 | 324 | 324 | 324 | 324 | -| Requests Failed | 0 | 0 | 0 | 0 | 0 | +| | OPENAI Cloud | JINA AI Cloud* | JINA (SageMaker)** | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|---------------------|-------------|----------|----------| +| P50 Latency | 193.15 ms | 179.33 ms | 185.21 ms | 19.06 ms | 14.69 ms | 21.36 ms | +| P90 Latency | 261.25 ms | 271.87 ms | 296.19 ms | 23.09 ms | 16.90 ms | 29.81 ms | +| P99 Latency | 621.05 ms | 402.43 ms | 306.94 ms | 24.27 ms | 18.80 ms | 30.29 ms | +| Requests Made | 324 | 324 | 324 | 324 | 324 | 324 | +| Requests Failed | 0 | 0 | 3 | 0 | 0 | 0 | -| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | -|-------------|---------------|----------------|-------------|----------|----------| -| P50 Latency | 180.74 ms | 182.62 ms | 16.48 ms | 14.35 ms | 23.22 ms | -| P90 Latency | 222.34 ms | 262.65 ms | 20.70 ms | 16.15 ms | 29.71 ms | -| P99 Latency | 1.11 sec | 363.01 ms | 22.82 ms | 19.82 ms | 31.07 ms | -| Requests Made | 2,991 | 2,991 | 3,015 | 3,024 | 3,024 | -| Requests Failed | 0 | 2,986 | 0 | 0 | 0 | +| | OPENAI Cloud | JINA AI Cloud* | JINA (SageMaker)** | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|---------------------|-------------|----------|----------| +| P50 Latency | 180.74 ms | 182.62 ms | 515.84 ms | 16.48 ms | 14.35 ms | 23.22 ms | +| P90 Latency | 222.34 ms | 262.65 ms | 654.85 ms | 20.70 ms | 16.15 ms | 29.71 ms | +| P99 Latency | 1.11 sec | 363.01 ms | 724.48 ms | 22.82 ms | 19.82 ms | 31.07 ms | +| Requests Made | 2,991 | 2,991 | 2963 | 3,015 | 3,024 | 3,024 | +| Requests Failed | 0 | 2,986 | 0 | 0 | 0 | 0 | -| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | -|-------------|---------------|----------------|-------------|-----------|----------| -| P50 Latency | 15.70 sec | 15.82 sec | 24.40 ms | 14.86 ms | 23.74 ms | -| P90 Latency | 22.01 sec | 21.91 sec | 25.14 ms | 17.81 
ms | 31.74 ms | -| P99 Latency | 23.59 sec | 23.12 sec | 27.61 ms | 19.52 ms | 34.11 ms | -| Requests Made | 6,234 | 6,771 | 30,002 | 30,002 | 30,001 | -| Requests Failed | 0 | 6,711 | 0 | 0 | 0 | +| | OPENAI Cloud | JINA AI Cloud* | JINA (SageMaker)** | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|---------------------|-------------|-----------|----------| +| P50 Latency | 15.70 sec | 15.82 sec | 17.97 sec | 24.40 ms | 14.86 ms | 23.74 ms | +| P90 Latency | 22.01 sec | 21.91 sec | 25.30 sec | 25.14 ms | 17.81 ms | 31.74 ms | +| P99 Latency | 23.59 sec | 23.12 sec | 27.03 sec | 27.61 ms | 19.52 ms | 34.11 ms | +| Requests Made | 6,234 | 6,771 | 2963 | 30,002 | 30,002 | 30,001 | +| Requests Failed | 0 | 6,711 | 0 | 0 | 0 | 0 | \* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM for premium plan) +\** `jina-embeddings-v2-base-en` on Sagemaker with `ml.g4dn.xlarge` + ## See more