From 8439ed950e1d8f2c7e3b45891d174fcd4c7efd6d Mon Sep 17 00:00:00 2001 From: cdxker Date: Thu, 12 Sep 2024 14:41:55 -0700 Subject: [PATCH 1/7] docs: baseline for new pages on TVF / TVI / named to be determined --- guides/using-trieve-vector-inference.mdx | 105 ++++++++++++ mint.json | 26 ++- vector-inference/aws-installation.mdx | 177 +++++++++++++++++++++ vector-inference/dense.mdx | 45 ++++++ vector-inference/embed.mdx | 113 +++++++++++++ vector-inference/embed_all.mdx | 114 +++++++++++++ vector-inference/embed_sparse.mdx | 127 +++++++++++++++ vector-inference/introduction.mdx | 29 ++++ vector-inference/openai.mdx | 3 + vector-inference/openai_compat.mdx | 134 ++++++++++++++++ vector-inference/rerank.mdx | 42 +++++ vector-inference/reranker.mdx | 140 ++++++++++++++++ vector-inference/splade.mdx | 47 ++++++ vector-inference/working-with-reranker.mdx | 0 14 files changed, 1099 insertions(+), 3 deletions(-) create mode 100644 guides/using-trieve-vector-inference.mdx create mode 100644 vector-inference/aws-installation.mdx create mode 100644 vector-inference/dense.mdx create mode 100644 vector-inference/embed.mdx create mode 100644 vector-inference/embed_all.mdx create mode 100644 vector-inference/embed_sparse.mdx create mode 100644 vector-inference/introduction.mdx create mode 100644 vector-inference/openai.mdx create mode 100644 vector-inference/openai_compat.mdx create mode 100644 vector-inference/rerank.mdx create mode 100644 vector-inference/reranker.mdx create mode 100644 vector-inference/splade.mdx create mode 100644 vector-inference/working-with-reranker.mdx diff --git a/guides/using-trieve-vector-inference.mdx b/guides/using-trieve-vector-inference.mdx new file mode 100644 index 0000000..85a3acc --- /dev/null +++ b/guides/using-trieve-vector-inference.mdx @@ -0,0 +1,105 @@ +--- +title: 'Install Trieve vector inference' +description: 'Install Trieve Vector Inference' +icon: 'files' +--- + +## Installation Requirements + +- `eksctl` >= 0.171 ([eksctl installation guide](https://eksctl.io/installation)) +- `aws` >= 2.15 ([aws installation guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) +- `kubectl` >= 1.28 ([kubectl installation guide](https://kubernetes.io/docs/tasks/tools/#kubectl)) +- `helm` >= 3.14 ([helm installation guide](https://helm.sh/docs/intro/install/#helm)) + +You'll also need a license to run Trieve Vector Inference + +### Getting your license + +(contact us here) + +## Check AWS quota + +Ensure you have quotas for +1) At least **4 vCPUs** for On-Demand G and VT instances in the region of choice. + +Check quota for *us-east-2* [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +- At least **1 load-balancer per each model you want. + +Check quota for *us-east-2* [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +## Deploying the Cluster + +### Setting up environment variables + +Create eks cluster and install needed plugins + +Your AWS Account ID: +```sh +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" +``` + +Your AWS REGION: +```sh +export AWS_REGION=us-east-2 +``` + +Your Kubernetes cluster name: + +```sh +export CLUSTER_NAME=trieve-gpu +``` + +Your machine types, we recommend `g4dn.xlarge`, as it is the cheapest on AWS. A single small node is needed for extra utility. 
+ +```sh +export CPU_INSTANCE_TYPE=t3.small +export GPU_INSTANCE_TYPE=g4dn.xlarge +export GPU_COUNT=1 +``` + +### Create your cluster + +```sh +curl ./create_cluster.sh | sh +``` + +This will take around 25 minutes to complete + +## Install Trieve Vector Inference + +### Specify your embedding models + +Modify `embedding_models.yaml` for the models that you want to use + +### Install the helm chart + +```sh +helm upgrade -i vector-inference oci://registry-1.docker.io/trieve/embeddings-helm -f embedding_models.yaml +``` + +### Get your model endpoints + +```sh +kubectl get ingress +``` + +![](./assets/ingress.png) + +## Using Trieve Vector Inference + +```sh +curl -X POST -H "Content-Type: application/json" -d '{"inputs": "cancer" ,"model": "en"} +``` + +## Optional: Delete the cluster + +```sh +cluster_name=trieve-gpu +region=us-east-2 + +helm uninstall vector-release +helm uninstall nvdp -n kube-system +helm uninstall aws-load-balancer-controller -n kube-system +eksctl delete cluster --region=${REGION} --name=${CLUSTER_NAME} +``` diff --git a/mint.json b/mint.json index 740ad85..7f9ddd0 100644 --- a/mint.json +++ b/mint.json @@ -35,6 +35,10 @@ { "name": "API Reference", "url": "api-reference" + }, + { + "name": "Vector Inference", + "url": "vector-inference" } ], "anchors": [ @@ -66,7 +70,8 @@ "getting-started/introduction", "getting-started/quickstart", "getting-started/trieve-primitives", - "getting-started/screenshots" + "getting-started/screenshots", + "vector-inference/introduction" ] }, { @@ -75,7 +80,8 @@ "self-hosting/docker-compose", "self-hosting/local-kube", "self-hosting/aws", - "self-hosting/gcp" + "self-hosting/gcp", + "vector-inference/aws-installation" ] }, { @@ -85,7 +91,21 @@ "guides/uploading-files", "guides/searching-with-trieve", "guides/recommending-with-trieve", - "guides/RAG-with-trieve" + "guides/RAG-with-trieve", + "vector-inference/rerank", + "vector-inference/splade", + "vector-inference/dense", + "vector-inference/openai" + ] + }, + { + "group": "API Reference", + "pages": [ + "vector-inference/embed", + "vector-inference/embed_all", + "vector-inference/embed_sparse", + "vector-inference/reranker", + "vector-inference/openai_compat" ] }, { diff --git a/vector-inference/aws-installation.mdx b/vector-inference/aws-installation.mdx new file mode 100644 index 0000000..0d539d8 --- /dev/null +++ b/vector-inference/aws-installation.mdx @@ -0,0 +1,177 @@ +--- +title: 'AWS Installation' +description: 'Install Trieve Vector Inference' +icon: 'aws' +--- + +## Installation Requirements + +- `eksctl` >= 0.171 ([eksctl installation guide](https://eksctl.io/installation)) +- `aws` >= 2.15 ([aws installation guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) +- `kubectl` >= 1.28 ([kubectl installation guide](https://kubernetes.io/docs/tasks/tools/#kubectl)) +- `helm` >= 3.14 ([helm installation guide](https://helm.sh/docs/intro/install/#helm)) + +You'll also need a license to run Trieve Vector Inference + +### Getting your license + +(contact us here) + +## Check AWS quota + + + Ensure you have quotas for Both GPU's and Load Balancers. + + +1) At least **4 vCPUs** for On-Demand G and VT instances in the region of choice. + +Check quota for [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +2) You will need **1 load-balancer** per each model you want. 
+ +Check quotas for [here](https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas/L-3819A6DF) + +## Deploying the Cluster + +### Setting up environment variables + +Create eks cluster and install needed plugins + +Your AWS Account ID: +```sh +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" +``` + +Your AWS REGION: +```sh +export AWS_REGION=us-east-2 +``` + +Your Kubernetes cluster name: + +```sh +export CLUSTER_NAME=trieve-gpu +``` + +Your machine types, we recommend `g4dn.xlarge`, as it is the cheapest on AWS. A single small node is needed for extra utility. + +```sh +export CPU_INSTANCE_TYPE=t3.small +export GPU_INSTANCE_TYPE=g4dn.xlarge +export GPU_COUNT=1 +``` + +**To use our reccomended defaults** + +```sh +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" +export AWS_REGION=us-east-2 +export CLUSTER_NAME=trieve-gpu +export CPU_INSTANCE_TYPE=t3.small +export GPU_INSTANCE_TYPE=g4dn.xlarge +export GPU_COUNT=1 +``` + +### Create your cluster + +Download the bootstrap script +```sh +wget cdn.trieve.ai/bootstrap-eks.sh +``` + +Run the bootstrap script with bash + +```sh +bash bootstrap-eks.sh +``` + +This will take around 25 minutes to complete + +## Install Trieve Vector Inference + +### Configure `embedding_models.yaml` + +Frist download the example configiguration file + +```sh +wget https://cdn.trieve.ai/embedding_models.yaml +``` + +Now you can modify your `embedding_models.yaml`, this defines all the models that you will want to use + +```yaml embedding_models.yaml +accessKey: "" + +models: + bgeM3: + replicas: 2 + revision: main + modelName: BAAI/bge-m3 # The end of the URL https://huggingface.co/BAAI/bge-m3 + hfToken: "" # If you have a private hugging face repo + spladeDoc: + replicas: 2 + modelName: naver/efficient-splade-VI-BT-large-doc # The end of the URL https://huggingface.co/naver/efficient-splade-VI-BT-large-doc + isSplade: true + spladeQuery: + replicas: 2 + modelName: naver/efficient-splade-VI-BT-large-doc # The end of the URL https://huggingface.co/naver/efficient-splade-VI-BT-large-doc + isSplade: true + bge-reranker: + replicas: 2 + modelName: BAAI/bge-reranker-large + isSplade: false +``` + +### Install the helm chart + +```sh +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + +### Get your model endpoints + +```sh +kubectl get ingress +``` + +The output looks something like this: + +``` +NAME CLASS HOSTS ADDRESS PORTS AGE +vector-inference-embedding-bge-reranker-ingress alb * k8s-default-vectorin-18b7ade77a-2040086997.us-east-2.elb.amazonaws.com 80 73s +vector-inference-embedding-bgem3-ingress alb * k8s-default-vectorin-25e84e25f0-1362792264.us-east-2.elb.amazonaws.com 80 73s +vector-inference-embedding-spladedoc-ingress alb * k8s-default-vectorin-8af81ad2bd-192706382.us-east-2.elb.amazonaws.com 80 72s +vector-inference-embedding-spladequery-ingress alb * k8s-default-vectorin-10404abaee-1617952667.us-east-2.elb.amazonaws.com 80 3m20s +``` + +## Using Trieve Vector Inference + +Each `ingress` point will be using their own Application Load Balancer within AWS. 
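+
+If you want to capture an endpoint in a script, one option is a `kubectl` JSONPath query (the ingress name below comes from the example output above; substitute the one for your model):
+
+```sh
+export ENDPOINT=$(kubectl get ingress vector-inference-embedding-bgem3-ingress \
+  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
+echo "$ENDPOINT"
+```
+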
The `Address` provided is the model's endpoint that you can make [dense embeddings](/vector-inference/embed), [sparse embeddings](/vector-inference/embed_sparse), or [reranker calls](/vector-inference/reranker) based on the models you chose + +Check out the guides for more information on configureation + + + + How to setup a dedicated instance for the sparse splade embedding model + + + How to use private or gated hugging face models. Or any models that you want + + + Trieve Vector Inference has openai compatible routes + + + +## Optional: Delete the cluster + +```sh +cluster_name=trieve-gpu +region=us-east-2 + +helm uninstall vector-release +helm uninstall nvdp -n kube-system +helm uninstall aws-load-balancer-controller -n kube-system +eksctl delete cluster --region=${REGION} --name=${CLUSTER_NAME} +``` diff --git a/vector-inference/dense.mdx b/vector-inference/dense.mdx new file mode 100644 index 0000000..b476c3d --- /dev/null +++ b/vector-inference/dense.mdx @@ -0,0 +1,45 @@ +--- +title: 'Using Custom Models' +--- + +## Custom or fine tuned models in Trieve Vector Inference + +The [open source text models](https://huggingface.co/spaces/mteb/leaderboard) on hugging face may not be what you always want, + + + + +To use a private or custom model with Trieve Vector Inference, you will need to update your `embedding_models.yaml` file. + +If the model is a private or gated hugging face model, you will need to include your huggingface api token + +```yaml embedding_models.yaml +... +models: + ... + my-custom-model: + replicas: 1 + revision: main + modelName: trieve/private-model-example + hfToken: "hf_**********************************" +... +``` + + + +Update TVI to include your models + +```bash +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + + + +```sh +kubectl get ing +``` + + + diff --git a/vector-inference/embed.mdx b/vector-inference/embed.mdx new file mode 100644 index 0000000..07248bc --- /dev/null +++ b/vector-inference/embed.mdx @@ -0,0 +1,113 @@ +--- +title: 'Create Embedding' +sidebarTitle: 'POST /embed' +description: 'Get Embeddings. Returns a 424 status code if the model is not an embedding model' +--- + +Generating an embedding from a dense embedding model + + + +```json RAW Json +{ + "inputs": "The model input", + "prompt_name": null, + "normalize": true, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///embed +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/embed", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/embed", json={ + "inputs": "test single input" +}); +``` + + + + +```json 200 Embeddings +[ + [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + [ + 0.04496114, + -0.039057795, + -0.022400795, + ... + ] +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + Inputs that need to be embedded + + + + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. 
+ +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + diff --git a/vector-inference/embed_all.mdx b/vector-inference/embed_all.mdx new file mode 100644 index 0000000..5e37c47 --- /dev/null +++ b/vector-inference/embed_all.mdx @@ -0,0 +1,114 @@ +--- +title: 'Create Embedding' +sidebarTitle: 'POST /embed_all' +description: 'Get Embeddings. Returns a 424 status code if the model is not an embedding model' +--- + +Generating an embedding from a dense embedding model + + + +```json RAW Json +{ + "inputs": "The model input", + "prompt_name": null, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///embed_all +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/embed_all", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/embed_all", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Embeddings +[ + [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + [ + 0.04496114, + -0.039057795, + -0.022400795, + ... + ] +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. + +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/embed_sparse.mdx b/vector-inference/embed_sparse.mdx new file mode 100644 index 0000000..44a7c98 --- /dev/null +++ b/vector-inference/embed_sparse.mdx @@ -0,0 +1,127 @@ +--- +title: 'Create Sparse Embedding' +sidebarTitle: 'POST /embed_sparse' +description: 'Get Sparse Embeddings. Returns a 424 status code if the model is not a splade embedding model' +--- + +Generating an embedding from a sparse embedding model. 
+The main one that we support right now are the splade models + + + +```json RAW Json +{ + "inputs": "The model input", + "prompt_name": null, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///embed_sparse +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/embed_sparse", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/embed_sparse", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Embeddings +[ + // Embedding 1 + [ + { + "index": 1012, + "value": 0.9970703 + }, + { + "index": 4456, + "value": 2.7832031 + } + ], + // Embedding 2 + [ + { + "index": 990, + "value": 2.783203 + }, + { + "index": 3021, + "value": 10.9970703 + }, + ... + ], + ... +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. + +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx new file mode 100644 index 0000000..c450ed3 --- /dev/null +++ b/vector-inference/introduction.mdx @@ -0,0 +1,29 @@ +--- +title: Introduction +description: Trieve Vector Inference is an on-prem solution for fast vector infernece +--- + +## Quick Start + + + + Adding Trieve Vector Inference into your AWS account + + + + Using the `/embed` route + + + + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference + + + + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference + + + + +## Performance Difference + +Refer to our benchmarks diff --git a/vector-inference/openai.mdx b/vector-inference/openai.mdx new file mode 100644 index 0000000..f22e933 --- /dev/null +++ b/vector-inference/openai.mdx @@ -0,0 +1,3 @@ +--- +title: "OpenAI compatible interface" +--- diff --git a/vector-inference/openai_compat.mdx b/vector-inference/openai_compat.mdx new file mode 100644 index 0000000..a567dd5 --- /dev/null +++ b/vector-inference/openai_compat.mdx @@ -0,0 +1,134 @@ +--- +title: 'OpenAI compatible embeddings route' +sidebarTitle: 'POST /v1/embeddings' +description: 'OpenAI compatible route. 
Returns a 424 status code if the model is not an embedding model' +--- + +Generating an embedding from a dense embedding model + + + +```json Raw JSON +{ + "encoding_format": "float", + "input": "string", + "model": "null", + "user": "null" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http:///v1/embeddings +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/v1/embeddings", json={ + "inputs": ["test input", "test input 2"] +}); + +## or + +requests.post(f"{endpoint}/v1/embeddings", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Embeddings +{ + "data": [ + { + "embedding": [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + "index": 0, + "object": "embedding" + }, + { + "embedding": [ + 0.038483415, + -0.00076982786, + -0.020039458 + ... + ], + "index": 1, + "object": "embedding" + }, + ... + ], + "model": "thenlper/gte-base", + "object": "list", + "usage": { + "prompt_tokens": 512, + "total_tokens": 512 + } +} +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + + + + +The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. + +Must be a key in the `sentence-transformers` configuration prompts dictionary. + +For example if `prompt_name` is **"doc"** then the sentence **"How to get fast inference?"** will be encoded as **"doc: How to get fast inference?"** because the prompt text will be prepended before any text to encode. + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/rerank.mdx b/vector-inference/rerank.mdx new file mode 100644 index 0000000..14ed536 --- /dev/null +++ b/vector-inference/rerank.mdx @@ -0,0 +1,42 @@ +--- +title: "Working with Reranker" +--- + +## What is a Reranker / CrossEncoder? + +`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion + +## Using with Trieve Vector Inference + + + +To use a reranker model with Trieve Vector Inference, you will need to update your embedding_models.yaml file + +```yaml embedding_models.yaml +... +models: + ... + my-reranker-model: + replicas: 1 + revision: main + modelName: BAAI/bge-reranker-large +... +``` + + + +Update TVF to include your models + +```bash +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + + + +```sh +kubectl get ing +``` + + diff --git a/vector-inference/reranker.mdx b/vector-inference/reranker.mdx new file mode 100644 index 0000000..1e1e6a2 --- /dev/null +++ b/vector-inference/reranker.mdx @@ -0,0 +1,140 @@ +--- +title: 'Get ranks' +sidebarTitle: 'POST /rerank' +description: 'Runs Reranker. Returns a 424 status code if the model is not a Reranker model' +--- + + + +```json Raw Json +{ + "query": "What are some good electric cars", + "texts": [ + "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ...", + "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." 
+ ], + "raw_scores": false, + "return_text": false, + "truncate": false, + "truncation_direction": "right" +} +``` + +```sh curl +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "query": "What are some good electric cars", + "texts": [ + "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ...", + "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." + ], + "raw_scores": false, + "return_text": false, + "truncate": false, + "truncation_direction": "right" + }' \ + --url http:///rerank +``` + +```py python +import requests + +endpoint = "" + +requests.post(f"{endpoint}/rerank", json={ + "query": "What are some good electric cars", + "texts": [ + "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ...", + "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." + ], + "raw_scores": False, + "return_text": False, + "truncate": False, + "truncation_direction": "right" +}); + +## or + +requests.post(f"{endpoint}/rerank", json={ + "inputs": "test single input" +}); + + +``` + + + + +```json 200 Ranks +[ + { + "index":1, + "score":0.15253653, + // if return_text = true + "text": "The Tesla Cybertruck is an all-electric, battery-powered light-duty truck unveiled by Tesla, Inc. ..." + }, + { + "index":0, + "score":0.00498227 + // if return_text = true + "text": "Here’s the information about the Mercedes CLR GTR: The Mercedes CLR GTR is a remarkable racing car ..." + } +] +``` + +```json 413 +{ + "error": "Batch size error", + "error_type": "validation" +} +``` + +```json 422 +{ + "error": "Tokenization error", + "error_type": "validation" +} +``` + +```json 424 +{ + "error": "Inference failed", + "error_type": "backend" +} +``` + +```json 429 +{ + "error": "Model is overloaded", + "error_type": "overloaded" +} +``` + + + + + + Inputs that need to be embedded + + + + Inputs that need to be embedded + + + + Output the raw reranker score or the normalized score between 0-1. + When `false`, score is between 0 and 1, otherwise range is indeterminate + + + + Return the text with along with each rank + + + +Automatically truncate inputs that are longer than the maximum supported size + + + + + diff --git a/vector-inference/splade.mdx b/vector-inference/splade.mdx new file mode 100644 index 0000000..4a74b97 --- /dev/null +++ b/vector-inference/splade.mdx @@ -0,0 +1,47 @@ +--- +title: "Working with Splade v2" +--- + +## What is splade? + +`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion + +## Using with Trieve Vector Inference + + + +To use splade with Trieve Vector Inference, you will need to adapt both the `doc` and `query` models + +The splade `document` model is the model you use to encode files, where the `query` model is the one to encode the query that you will be searching with + +```yaml embedding_models.yaml +models: + # ... + spladeDoc: + replicas: 1 + modelName: naver/efficient-splade-VI-BT-large-doc + isSplade: true + spladeQuery: + replicas: 1 + modelName: naver/efficient-splade-VI-BT-large-query + isSplade: true + # ... 
+``` + + + +Update TVF to include your models + +```bash +helm upgrade -i vector-inference \ + oci://registry-1.docker.io/trieve/embeddings-helm \ + -f embedding_models.yaml +``` + + + +```sh +kubectl get ing +``` + + diff --git a/vector-inference/working-with-reranker.mdx b/vector-inference/working-with-reranker.mdx new file mode 100644 index 0000000..e69de29 From 0cf6ce6a0dc51dffd46d5c6e20fab45a7fd620b1 Mon Sep 17 00:00:00 2001 From: cdxker Date: Mon, 16 Sep 2024 10:52:23 -0700 Subject: [PATCH 2/7] fix: spell checks and formatting --- vector-inference/aws-installation.mdx | 23 +++++++++++++++------- vector-inference/embed_sparse.mdx | 4 ++-- vector-inference/openai_compat.mdx | 6 +++--- vector-inference/working-with-reranker.mdx | 0 4 files changed, 21 insertions(+), 12 deletions(-) delete mode 100644 vector-inference/working-with-reranker.mdx diff --git a/vector-inference/aws-installation.mdx b/vector-inference/aws-installation.mdx index 0d539d8..b07d6e2 100644 --- a/vector-inference/aws-installation.mdx +++ b/vector-inference/aws-installation.mdx @@ -61,7 +61,13 @@ export GPU_INSTANCE_TYPE=g4dn.xlarge export GPU_COUNT=1 ``` -**To use our reccomended defaults** +Disable AWS CLI pagination (optional): + +```sh +export AWS_PAGER="" +``` + +**To use our recommended defaults** ```sh export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query "Account" --output text)" @@ -70,16 +76,17 @@ export CLUSTER_NAME=trieve-gpu export CPU_INSTANCE_TYPE=t3.small export GPU_INSTANCE_TYPE=g4dn.xlarge export GPU_COUNT=1 +export AWS_PAGER="" ``` ### Create your cluster -Download the bootstrap script +Download the `bootstrap-eks.sh` script ```sh wget cdn.trieve.ai/bootstrap-eks.sh ``` -Run the bootstrap script with bash +Run `bootstrap-eks.sh` with bash ```sh bash bootstrap-eks.sh @@ -91,7 +98,7 @@ This will take around 25 minutes to complete ### Configure `embedding_models.yaml` -Frist download the example configiguration file +First download the example configuration file ```sh wget https://cdn.trieve.ai/embedding_models.yaml @@ -150,7 +157,7 @@ vector-inference-embedding-spladequery-ingress alb * k8s-default-ve Each `ingress` point will be using their own Application Load Balancer within AWS. The `Address` provided is the model's endpoint that you can make [dense embeddings](/vector-inference/embed), [sparse embeddings](/vector-inference/embed_sparse), or [reranker calls](/vector-inference/reranker) based on the models you chose -Check out the guides for more information on configureation +Check out the guides for more information on configuration @@ -167,8 +174,10 @@ Check out the guides for more information on configureation ## Optional: Delete the cluster ```sh -cluster_name=trieve-gpu -region=us-east-2 +CLUSTER_NAME=trieve-gpu +REGION=us-east-2 + +aws eks update-kubeconfig --region ${REGION} --name ${CLUSTER_NAME} helm uninstall vector-release helm uninstall nvdp -n kube-system diff --git a/vector-inference/embed_sparse.mdx b/vector-inference/embed_sparse.mdx index 44a7c98..2ceee6a 100644 --- a/vector-inference/embed_sparse.mdx +++ b/vector-inference/embed_sparse.mdx @@ -1,11 +1,11 @@ --- title: 'Create Sparse Embedding' sidebarTitle: 'POST /embed_sparse' -description: 'Get Sparse Embeddings. Returns a 424 status code if the model is not a splade embedding model' +description: 'Get Sparse Embeddings. Returns a 424 status code if the model is not a Splade embedding model' --- Generating an embedding from a sparse embedding model. 
-The main one that we support right now are the splade models +The main one that we support right now are the Splade models diff --git a/vector-inference/openai_compat.mdx b/vector-inference/openai_compat.mdx index a567dd5..00e0430 100644 --- a/vector-inference/openai_compat.mdx +++ b/vector-inference/openai_compat.mdx @@ -20,7 +20,7 @@ Generating an embedding from a dense embedding model ```sh curl curl -X POST \ -H "Content-Type: application/json"\ - -d '{"inputs": "test input"}' \ + -d '{"input": "test input"}' \ --url http:///v1/embeddings ``` @@ -30,13 +30,13 @@ import requests endpoint = "" requests.post(f"{endpoint}/v1/embeddings", json={ - "inputs": ["test input", "test input 2"] + "input": ["test input", "test input 2"] }); ## or requests.post(f"{endpoint}/v1/embeddings", json={ - "inputs": "test single input" + "input": "test single input" }); diff --git a/vector-inference/working-with-reranker.mdx b/vector-inference/working-with-reranker.mdx deleted file mode 100644 index e69de29..0000000 From d173b3b9a6199d15ae231a4271cda01d52d02d81 Mon Sep 17 00:00:00 2001 From: cdxker Date: Mon, 16 Sep 2024 10:52:38 -0700 Subject: [PATCH 3/7] docs: openai copmatiblity guide --- vector-inference/openai.mdx | 38 ++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/vector-inference/openai.mdx b/vector-inference/openai.mdx index f22e933..e2a0c3c 100644 --- a/vector-inference/openai.mdx +++ b/vector-inference/openai.mdx @@ -1,3 +1,39 @@ --- -title: "OpenAI compatible interface" +title: "Using OpenAI SDK" --- + +Trieve Vector Inference is compatible with the OpenAI api. This means you're able to just replace the endpoint, without changing any pre-existing code. +Here's an example with the `openai` python sdk + +First install the dependencies + +```sh +pip install openai +pip install requests +pip install python-dotenv +``` + +```python openai_compatibility.py +import openai +import time +import requests +import os +from dotenv import load_dotenv + +load_dotenv() + +endpoint = "http://" + +openai.base_url = endpoint + +client = openai.OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url=endpoint +) + +embedding = client.embeddings.create( + input="This is some example input", + model="BAAI/bge-m3" +) +``` From 217c570f6a148bf3f633d22c6881f7cb252e73e8 Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 17 Sep 2024 00:10:45 -0700 Subject: [PATCH 4/7] docs: added benchmarks --- vector-inference/introduction.mdx | 57 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx index c450ed3..b205c7e 100644 --- a/vector-inference/introduction.mdx +++ b/vector-inference/introduction.mdx @@ -1,8 +1,58 @@ --- title: Introduction -description: Trieve Vector Inference is an on-prem solution for fast vector infernece +description: Trieve Vector Inference is an on-prem solution for fast vector inference --- +## Inspiration + +SaSS offerings for text embeddings have 2 major issues: +1) They have higher latency, due to batch processing. +2) They have heavy rate limits. + +Trieve Vector Inference was created so you could Host Dedicated embedding servers in your own cloud. + +## Performance Difference + +Benchmarks ran using [wrk2](https://github.com/giltene/wrk2) over 30 seconds on 12 threads and 40 active connections. + +Machine used to test was on `m5.large` in `us-west-1`. 
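+
+For reference, a run against a single TVI endpoint at a fixed request rate looks roughly like the following `wrk2` invocation (the endpoint, target rate, and `post.lua` script are illustrative; `post.lua` is assumed to set the JSON POST body and `Content-Type` header):
+
+```sh
+# wrk2 installs a binary named `wrk`; -R pins the request rate (here 100 rps)
+# post.lua (not shown) is assumed to set the body, e.g. '{"inputs": "benchmark input"}'
+wrk -t12 -c40 -d30s -R100 -s post.lua "http://$ENDPOINT/embed"
+```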
+ + + + +| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|-------------|----------|----------| +| P50 Latency | 193.15 ms | 179.33 ms | 19.06 ms | 14.69 ms | 21.36 ms | +| P90 Latency | 261.25 ms | 271.87 ms | 23.09 ms | 16.90 ms | 29.81 ms | +| P99 Latency | 621.05 ms | 402.43 ms | 24.27 ms | 18.80 ms | 30.29 ms | +| Requests Made | 324 | 324 | 324 | 324 | 324 | +| Requests Failed | 0 | 0 | 0 | 0 | 0 | + + + +| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|-------------|----------|----------| +| P50 Latency | 180.74 ms | 182.62 ms | 16.48 ms | 14.35 ms | 23.22 ms | +| P90 Latency | 222.34 ms | 262.65 ms | 20.70 ms | 16.15 ms | 29.71 ms | +| P99 Latency | 1.11 sec | 363.01 ms | 22.82 ms | 19.82 ms | 31.07 ms | +| Requests Made | 2,991 | 2,991 | 3,015 | 3,024 | 3,024 | +| Requests Failed | 0 | 2,986 | 0 | 0 | 0 | + + + +| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|-------------|-----------|----------| +| P50 Latency | 15.70 sec | 15.82 sec | 24.40 ms | 14.86 ms | 23.74 ms | +| P90 Latency | 22.01 sec | 21.91 sec | 25.14 ms | 17.81 ms | 31.74 ms | +| P99 Latency | 23.59 sec | 23.12 sec | 27.61 ms | 19.52 ms | 34.11 ms | +| Requests Made | 6,234 | 6,771 | 30,002 | 30,002 | 30,001 | +| Requests Failed | 0 | 6,711 | 0 | 0 | 0 | + + + + +\* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM) + ## Quick Start @@ -22,8 +72,3 @@ description: Trieve Vector Inference is an on-prem solution for fast vector infe Check out the API Reference to see all of the available endpoints for Trieve Vector Inference - - -## Performance Difference - -Refer to our benchmarks From 6064c544d803ab1cb6970583de96d0cc56217b77 Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 17 Sep 2024 13:46:50 -0700 Subject: [PATCH 5/7] docs: added trouble shooting guide and pricing --- vector-inference/pricing.mdx | 64 ++++++++++++++++++++++++++++ vector-inference/troubleshooting.mdx | 51 ++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 vector-inference/pricing.mdx create mode 100644 vector-inference/troubleshooting.mdx diff --git a/vector-inference/pricing.mdx b/vector-inference/pricing.mdx new file mode 100644 index 0000000..8966037 --- /dev/null +++ b/vector-inference/pricing.mdx @@ -0,0 +1,64 @@ +--- +title: Pricing +description: The pricing design Trieve Vector Inference +mode: wide +icon: money-bill +--- + +Trieve Vector Inference is meant to be an on-prem solution a license is needed for use. + +To obtain a license for Trieve Vector Inference contact us: + +- Email us at humans@trieve.ai +- [book a meeting](https://cal.com/nick.k/meet) +- Call us @ 628-222-4090 + + + + +
+
+

+<CardGroup cols={3}>
+  <Card title="$0* per month">
+    - Hosting License
+    - Unlimited Clusters
+  </Card>
+  <Card title="$500 per month">
+    - Hosting License
+    - Unlimited Clusters
+    - Dedicated Slack Support
+  </Card>
+  <Card title="$1000+ per month">
+    - Hosting License
+    - Unlimited Clusters
+    - Dedicated Slack Support
+    - 99.9% SLA
+    - Managed and hosted by Trieve
+  </Card>
+</CardGroup>
+
+
+ +\* Free for < 10 employees or Pre-seed diff --git a/vector-inference/troubleshooting.mdx b/vector-inference/troubleshooting.mdx new file mode 100644 index 0000000..355b91b --- /dev/null +++ b/vector-inference/troubleshooting.mdx @@ -0,0 +1,51 @@ +--- +title: Troubleshooting +icon: 'triangle-exclamation' +description: 'Common issues with self hosting' +--- + +There are a lot of moving parts in `eksctl`. Here’s a list of common issues we’ve seen customers run into: + + + + + This error happens when deleting the cluster and some pods in `kube-system` refuse to stop. + To fix this run the following command and the deletion process should be able to proceed. + + ```sh + kubectl get pods -n kube-system -o NAME | xargs kubectl -n kube-system delete + ``` + + + + This happens when the cluster doesn't properly delete load balancers, to fix this + + + + Run this to get the available load balancers + ```sh + kubectl get ingress + ``` + + + The output should look like this + ``` + NAME CLASS HOSTS ADDRESS PORTS AGE + vector-inference-embedding-bgem3-ingress alb * k8s-default-vectorin-25e84e25f0-1362792264.us-east-2.elb.amazonaws.com 80 3d19h + vector-inference-embedding-nomic-ingress alb * k8s-default-vectorin-eb664ce6e9-238019709.us-east-2.elb.amazonaws.com 80 2d20h + vector-inference-embedding-spladedoc-ingress alb * k8s-default-vectorin-8af81ad2bd-192706382.us-east-2.elb.amazonaws.com 80 3d19h + ``` + + + + + Go to EC2 > LoadBalancers ([link](https://us-west-1.console.aws.amazon.com/ec2/home?region=us-west-1#LoadBalancers:v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false)) and delete the alb's that have the ingress point names + + + + + The delete script should be able to resume + + + + From 16746df3b819f1e021e44f026c6710b9391a69cb Mon Sep 17 00:00:00 2001 From: cdxker Date: Tue, 17 Sep 2024 13:47:19 -0700 Subject: [PATCH 6/7] cleanup: cleaned up icons and extra descriptions --- mint.json | 6 ++- vector-inference/aws-installation.mdx | 9 +++- vector-inference/dense.mdx | 3 ++ vector-inference/embed.mdx | 2 +- vector-inference/embed_all.mdx | 2 +- vector-inference/embed_sparse.mdx | 2 +- vector-inference/introduction.mdx | 11 +++-- vector-inference/openai.mdx | 70 +++++++++++++++------------ vector-inference/openai_compat.mdx | 2 +- vector-inference/rerank.mdx | 15 +++++- vector-inference/reranker.mdx | 2 +- vector-inference/splade.mdx | 20 +++++++- 12 files changed, 94 insertions(+), 50 deletions(-) diff --git a/mint.json b/mint.json index 7f9ddd0..b8404d5 100644 --- a/mint.json +++ b/mint.json @@ -71,7 +71,8 @@ "getting-started/quickstart", "getting-started/trieve-primitives", "getting-started/screenshots", - "vector-inference/introduction" + "vector-inference/introduction", + "vector-inference/pricing" ] }, { @@ -81,7 +82,8 @@ "self-hosting/local-kube", "self-hosting/aws", "self-hosting/gcp", - "vector-inference/aws-installation" + "vector-inference/aws-installation", + "vector-inference/troubleshooting" ] }, { diff --git a/vector-inference/aws-installation.mdx b/vector-inference/aws-installation.mdx index b07d6e2..ed82233 100644 --- a/vector-inference/aws-installation.mdx +++ b/vector-inference/aws-installation.mdx @@ -1,6 +1,6 @@ --- title: 'AWS Installation' -description: 'Install Trieve Vector Inference' +description: 'Install Trieve Vector Inference in your own aws account' icon: 'aws' --- @@ -15,7 +15,12 @@ You'll also need a license to run Trieve Vector Inference ### Getting your license -(contact us here) +Contact us: +- Email us at humans@trieve.ai +- [book a 
meeting](https://cal.com/nick.k/meet) +- Call us @ 628-222-4090 + +Our pricing is [here](/vector-inference/pricing) ## Check AWS quota diff --git a/vector-inference/dense.mdx b/vector-inference/dense.mdx index b476c3d..00cce87 100644 --- a/vector-inference/dense.mdx +++ b/vector-inference/dense.mdx @@ -1,5 +1,8 @@ --- title: 'Using Custom Models' +icon: brackets-curly +description: How to use gated or private models hosted on huggingface +mode: wide --- ## Custom or fine tuned models in Trieve Vector Inference diff --git a/vector-inference/embed.mdx b/vector-inference/embed.mdx index 07248bc..5cb4153 100644 --- a/vector-inference/embed.mdx +++ b/vector-inference/embed.mdx @@ -22,7 +22,7 @@ Generating an embedding from a dense embedding model curl -X POST \ -H "Content-Type: application/json"\ -d '{"inputs": "test input"}' \ - --url http:///embed + --url "http://$ENDPOINT/embed" ``` ```py python diff --git a/vector-inference/embed_all.mdx b/vector-inference/embed_all.mdx index 5e37c47..2d46cc0 100644 --- a/vector-inference/embed_all.mdx +++ b/vector-inference/embed_all.mdx @@ -21,7 +21,7 @@ Generating an embedding from a dense embedding model curl -X POST \ -H "Content-Type: application/json"\ -d '{"inputs": "test input"}' \ - --url http:///embed_all + --url http://$ENDPOINT/embed_all ``` ```py python diff --git a/vector-inference/embed_sparse.mdx b/vector-inference/embed_sparse.mdx index 2ceee6a..59d18b9 100644 --- a/vector-inference/embed_sparse.mdx +++ b/vector-inference/embed_sparse.mdx @@ -22,7 +22,7 @@ The main one that we support right now are the Splade models curl -X POST \ -H "Content-Type: application/json"\ -d '{"inputs": "test input"}' \ - --url http:///embed_sparse + --url http://$ENDPOINT/embed_sparse ``` ```py python diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx index b205c7e..d308707 100644 --- a/vector-inference/introduction.mdx +++ b/vector-inference/introduction.mdx @@ -1,6 +1,7 @@ --- title: Introduction description: Trieve Vector Inference is an on-prem solution for fast vector inference +icon: rocket --- ## Inspiration @@ -51,24 +52,24 @@ Machine used to test was on `m5.large` in `us-west-1`. -\* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM) +\* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM for premium plan) -## Quick Start +## See more Adding Trieve Vector Inference into your AWS account - + Using the `/embed` route - + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference - + Check out the API Reference to see all of the available endpoints for Trieve Vector Inference diff --git a/vector-inference/openai.mdx b/vector-inference/openai.mdx index e2a0c3c..778df13 100644 --- a/vector-inference/openai.mdx +++ b/vector-inference/openai.mdx @@ -1,39 +1,45 @@ --- title: "Using OpenAI SDK" +icon: microchip-ai +description: How to integrate TVI with existing openai compatible endpoints --- Trieve Vector Inference is compatible with the OpenAI api. This means you're able to just replace the endpoint, without changing any pre-existing code. 
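+
+Before wiring up the SDK, you can sanity check the route with a plain HTTP call (the endpoint and model name below are placeholders; see the [/v1/embeddings reference](/vector-inference/openai_compat) for details):
+
+```sh
+curl -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"input": "test input", "model": "BAAI/bge-m3"}' \
+  --url "http://$ENDPOINT/v1/embeddings"
+```
+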
Here's an example with the `openai` python sdk -First install the dependencies - -```sh -pip install openai -pip install requests -pip install python-dotenv -``` - -```python openai_compatibility.py -import openai -import time -import requests -import os -from dotenv import load_dotenv - -load_dotenv() - -endpoint = "http://" - -openai.base_url = endpoint - -client = openai.OpenAI( - # This is the default and can be omitted - api_key=os.environ.get("OPENAI_API_KEY"), - base_url=endpoint -) - -embedding = client.embeddings.create( - input="This is some example input", - model="BAAI/bge-m3" -) -``` + + + ```sh + pip install openai requests python-dotenv + ``` + + + + Replace `base_url` with your embeddding endpoint. + + ```python openai_compatibility.py + import openai + import time + import requests + import os + from dotenv import load_dotenv + + load_dotenv() + + endpoint = "http://" + + openai.base_url = endpoint + + client = openai.OpenAI( + # This is the default and can be omitted + api_key=os.environ.get("OPENAI_API_KEY"), + base_url=endpoint + ) + + embedding = client.embeddings.create( + input="This is some example input", + model="BAAI/bge-m3" + ) + ``` + + diff --git a/vector-inference/openai_compat.mdx b/vector-inference/openai_compat.mdx index 00e0430..76bc121 100644 --- a/vector-inference/openai_compat.mdx +++ b/vector-inference/openai_compat.mdx @@ -21,7 +21,7 @@ Generating an embedding from a dense embedding model curl -X POST \ -H "Content-Type: application/json"\ -d '{"input": "test input"}' \ - --url http:///v1/embeddings + --url http://$ENDPOINT/v1/embeddings ``` ```py python diff --git a/vector-inference/rerank.mdx b/vector-inference/rerank.mdx index 14ed536..fed542d 100644 --- a/vector-inference/rerank.mdx +++ b/vector-inference/rerank.mdx @@ -1,12 +1,14 @@ --- title: "Working with Reranker" +mode: wide +icon: arrow-up-arrow-down --- ## What is a Reranker / CrossEncoder? -`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion +A `Reranker` model provides a powerful semantic boost to the search quality of any keyword or vector search system without requiring any overhaul or replacement. -## Using with Trieve Vector Inference +## Using Rerankers with Trieve Vector Inference @@ -38,5 +40,14 @@ helm upgrade -i vector-inference \ ```sh kubectl get ing ``` + +``` +NAME CLASS HOSTS ADDRESS PORTS AGE +vector-inference-embedding-bge-reranker-ingress alb * k8s-default-vectorin-b09efe8cf6-890425945.us-west-1.elb.amazonaws.com 80 77m +``` + +The output looks like this + + diff --git a/vector-inference/reranker.mdx b/vector-inference/reranker.mdx index 1e1e6a2..023f723 100644 --- a/vector-inference/reranker.mdx +++ b/vector-inference/reranker.mdx @@ -34,7 +34,7 @@ curl -X POST \ "truncate": false, "truncation_direction": "right" }' \ - --url http:///rerank + --url http://$ENDPOINT/rerank ``` ```py python diff --git a/vector-inference/splade.mdx b/vector-inference/splade.mdx index 4a74b97..a28855d 100644 --- a/vector-inference/splade.mdx +++ b/vector-inference/splade.mdx @@ -1,12 +1,15 @@ --- title: "Working with Splade v2" +icon: magnifying-glass +description: Learn how to use splade with TVI. +mode: wide --- ## What is splade? -`Splade` is similar to other inverted index approaches like `bm25`. `Splade` includes term expansion +`Splade` is similar to other inverted index approaches like `bm25`. 
`Splade` includes neural term expansion, meaning that it is able to match on synonym's much better than traditional bm25 -## Using with Trieve Vector Inference +## Using Splade with Trieve Vector Inference @@ -44,4 +47,17 @@ helm upgrade -i vector-inference \ kubectl get ing ``` + + + ```sh + ENDPOINT="k8s-default-vectorin...elb.amazonaws.com" + + curl -X POST \ + -H "Content-Type: application/json"\ + -d '{"inputs": "test input"}' \ + --url http://$ENDPOINT/embed_sparse + ``` + + For more information checkout the [API reference](/vector-inference/embed_sparse) for sparse vectors + From 9f40a28b80001a69ad761634492e84b71fff64c8 Mon Sep 17 00:00:00 2001 From: cdxker Date: Thu, 19 Sep 2024 23:19:02 -0700 Subject: [PATCH 7/7] docs: add SageMaker metrics with wrk2 benchmark --- vector-inference/introduction.mdx | 44 ++++++++++++++++--------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/vector-inference/introduction.mdx b/vector-inference/introduction.mdx index d308707..b0ddc66 100644 --- a/vector-inference/introduction.mdx +++ b/vector-inference/introduction.mdx @@ -21,39 +21,41 @@ Machine used to test was on `m5.large` in `us-west-1`. -| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | -|-------------|---------------|----------------|-------------|----------|----------| -| P50 Latency | 193.15 ms | 179.33 ms | 19.06 ms | 14.69 ms | 21.36 ms | -| P90 Latency | 261.25 ms | 271.87 ms | 23.09 ms | 16.90 ms | 29.81 ms | -| P99 Latency | 621.05 ms | 402.43 ms | 24.27 ms | 18.80 ms | 30.29 ms | -| Requests Made | 324 | 324 | 324 | 324 | 324 | -| Requests Failed | 0 | 0 | 0 | 0 | 0 | +| | OPENAI Cloud | JINA AI Cloud* | JINA (SageMaker)** | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|---------------------|-------------|----------|----------| +| P50 Latency | 193.15 ms | 179.33 ms | 185.21 ms | 19.06 ms | 14.69 ms | 21.36 ms | +| P90 Latency | 261.25 ms | 271.87 ms | 296.19 ms | 23.09 ms | 16.90 ms | 29.81 ms | +| P99 Latency | 621.05 ms | 402.43 ms | 306.94 ms | 24.27 ms | 18.80 ms | 30.29 ms | +| Requests Made | 324 | 324 | 324 | 324 | 324 | 324 | +| Requests Failed | 0 | 0 | 3 | 0 | 0 | 0 | -| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | -|-------------|---------------|----------------|-------------|----------|----------| -| P50 Latency | 180.74 ms | 182.62 ms | 16.48 ms | 14.35 ms | 23.22 ms | -| P90 Latency | 222.34 ms | 262.65 ms | 20.70 ms | 16.15 ms | 29.71 ms | -| P99 Latency | 1.11 sec | 363.01 ms | 22.82 ms | 19.82 ms | 31.07 ms | -| Requests Made | 2,991 | 2,991 | 3,015 | 3,024 | 3,024 | -| Requests Failed | 0 | 2,986 | 0 | 0 | 0 | +| | OPENAI Cloud | JINA AI Cloud* | JINA (SageMaker)** | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|---------------------|-------------|----------|----------| +| P50 Latency | 180.74 ms | 182.62 ms | 515.84 ms | 16.48 ms | 14.35 ms | 23.22 ms | +| P90 Latency | 222.34 ms | 262.65 ms | 654.85 ms | 20.70 ms | 16.15 ms | 29.71 ms | +| P99 Latency | 1.11 sec | 363.01 ms | 724.48 ms | 22.82 ms | 19.82 ms | 31.07 ms | +| Requests Made | 2,991 | 2,991 | 2963 | 3,015 | 3,024 | 3,024 | +| Requests Failed | 0 | 2,986 | 0 | 0 | 0 | 0 | -| | OPENAI Cloud | JINA AI Cloud* | TVI Jina | TVI BGE-M3 | TVI Nomic | -|-------------|---------------|----------------|-------------|-----------|----------| -| P50 Latency | 15.70 sec | 15.82 sec | 24.40 ms | 14.86 ms | 23.74 ms | -| P90 Latency | 22.01 sec | 21.91 sec | 25.14 ms | 17.81 
ms | 31.74 ms | -| P99 Latency | 23.59 sec | 23.12 sec | 27.61 ms | 19.52 ms | 34.11 ms | -| Requests Made | 6,234 | 6,771 | 30,002 | 30,002 | 30,001 | -| Requests Failed | 0 | 6,711 | 0 | 0 | 0 | +| | OPENAI Cloud | JINA AI Cloud* | JINA (SageMaker)** | TVI Jina | TVI BGE-M3 | TVI Nomic | +|-------------|---------------|----------------|---------------------|-------------|-----------|----------| +| P50 Latency | 15.70 sec | 15.82 sec | 17.97 sec | 24.40 ms | 14.86 ms | 23.74 ms | +| P90 Latency | 22.01 sec | 21.91 sec | 25.30 sec | 25.14 ms | 17.81 ms | 31.74 ms | +| P99 Latency | 23.59 sec | 23.12 sec | 27.03 sec | 27.61 ms | 19.52 ms | 34.11 ms | +| Requests Made | 6,234 | 6,771 | 2963 | 30,002 | 30,002 | 30,001 | +| Requests Failed | 0 | 6,711 | 0 | 0 | 0 | 0 | \* Failed requests was when rate limiting hit in (Jina AI rate limit is 60 RPM or 300 RPM for premium plan) +\** `jina-embeddings-v2-base-en` on Sagemaker with `ml.g4dn.xlarge` + ## See more