Add SearchApi search engine (leptonai#23)

SebastjanPrachovskij authored Jan 30, 2024
1 parent 420aa69 commit 4324365

Showing 3 changed files with 113 additions and 3 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -21,7 +21,7 @@ There are two default supported search engines: Bing and Google.
To use the Bing Web Search API, please visit [this link](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) to obtain your Bing subscription key.

### Google Search
You have two options for Google Search: you can use the [Google Search API](https://www.serper.dev) from Serper or opt for the [Programmable Search Engine](https://developers.google.com/custom-search) provided by Google.
You have three options for Google Search: you can use the [SearchApi Google Search API](https://www.searchapi.io/) from SearchApi, [Serper Google Search API](https://www.serper.dev) from Serper, or opt for the [Programmable Search Engine](https://developers.google.com/custom-search) provided by Google.

## Setup LLM and KV

@@ -49,6 +49,12 @@ cd web && npm install && npm run build
BACKEND=BING python search_with_lepton.py
```

For Google Search using SearchApi:
```shell
export SEARCHAPI_API_KEY=YOUR_SEARCHAPI_API_KEY
BACKEND=SEARCHAPI python search_with_lepton.py
```

For Google Search using Serper:
```shell
export SERPER_SEARCH_API_KEY=YOUR_SERPER_API_KEY
7 changes: 6 additions & 1 deletion lepton_template/README.md
@@ -17,6 +17,10 @@ If you are using Bing, you can subscribe to the bing search api [here](https://w

If you choose to use Google, you can follow the instructions [here](https://developers.google.com/custom-search/v1/overview) to get your Google search api key. We follow the convention and name it `GOOGLE_SEARCH_API_KEY`. We recommend you store the key as a secret in Lepton. You will also get a search engine CX id, which you will need as well.
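
As a quick local sanity check (illustrative only; the `GOOGLE_SEARCH_CX` variable name for the CX id is an assumption, not something shown in this diff), a Google-backed run would look roughly like:

```shell
# Hypothetical local run against the Google Programmable Search Engine backend.
# GOOGLE_SEARCH_CX is an assumed name for the CX id mentioned above.
export GOOGLE_SEARCH_API_KEY=YOUR_GOOGLE_SEARCH_API_KEY
export GOOGLE_SEARCH_CX=YOUR_SEARCH_ENGINE_CX_ID
BACKEND=GOOGLE python search_with_lepton.py
```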

### SearchApi

If you want to use SearchApi, a 3rd party Google Search API, you can retrieve the API key by registering [here](https://www.searchapi.io/). We follow the convention and name it `SEARCHAPI_API_KEY`. We recommend you store the key as a secret in Lepton.
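
For a local run, the key is used exactly as in the top-level README of this commit; a minimal sketch:

```shell
export SEARCHAPI_API_KEY=YOUR_SEARCHAPI_API_KEY
BACKEND=SEARCHAPI python search_with_lepton.py
```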

### Lepton Demo API

If you choose to use the lepton demo api, you don't need to do anything - your workspace credential will give you access to the demo api. Note that this does incur an API call cost.
@@ -30,7 +34,7 @@ Here are the configurations you can set for your deployment:

Then, set the following environmental variables.

* `BACKEND`: the search backend to use. If you don't have bing or google set up, simply use `LEPTON` to try the demo. Otherwise, do `BING` or `GOOGLE`.
* `BACKEND`: the search backend to use. If you don't have bing or google set up, simply use `LEPTON` to try the demo. Otherwise, do `BING`, `GOOGLE` or `SEARCHAPI`.
* `LLM_MODEL`: the LLM model to run. We recommend using `mixtral-8x7b`, but if you want to experiment other models, you can try the ones hosted on LeptonAI, for example, `llama2-70b`, `llama2-13b`, `llama2-7b`. Note that small models won't work that well.
* `KV_NAME`: the Lepton KV to use to store the search results. You can use the default `search-with-lepton`.
* `RELATED_QUESTIONS`: whether to generate related questions. If you set this to `true`, the search engine will generate related questions for you. Otherwise, it will not.
@@ -41,6 +45,7 @@ In addition, you will need to set the following secrets:
* `LEPTON_WORKSPACE_TOKEN`: this is required to call Lepton's LLM and KV apis. You can find your workspace token at [Settings](https://dashboard.lepton.ai/workspace-redirect/settings).
* `BING_SEARCH_V7_SUBSCRIPTION_KEY`: if you are using Bing, you need to specify the subscription key. Otherwise it is not needed.
* `GOOGLE_SEARCH_API_KEY`: if you are using Google, you need to specify the search api key. Note that you should also specify the cx in the env. If you are not using Google, it is not needed.
* `SEARCHAPI_API_KEY`: if you are using SearchApi, a 3rd party Google Search API, you need to specify the api key.
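
Putting the pieces together for a SearchApi-backed deployment, the values look roughly like this (illustrative only; in practice you enter them as environment variables and secrets on the Lepton deployment form rather than exporting them in a shell):

```shell
# Environment variables (using the defaults recommended above)
BACKEND=SEARCHAPI
LLM_MODEL=mixtral-8x7b
KV_NAME=search-with-lepton
RELATED_QUESTIONS=true

# Secrets
LEPTON_WORKSPACE_TOKEN=<your workspace token>
SEARCHAPI_API_KEY=<your SearchApi key>
```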

Once these fields are set, click `Deploy` button at the bottom of the page to create the deployment. You can see the deployment has now been created under [Deployments](https://dashboard.lepton.ai/workspace-redirect/deployments). Click on the deployment name to check the details. You’ll be able to see the deployment URL and status on this page.

101 changes: 100 additions & 1 deletion search_with_lepton.py
@@ -30,6 +30,7 @@
BING_MKT = "en-US"
GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1"
SERPER_SEARCH_ENDPOINT = "https://google.serper.dev/search"
SEARCHAPI_SEARCH_ENDPOINT = "https://www.searchapi.io/api/v1/search"

# Specify the number of references from the search engine you want to use.
# 8 is usually a good number.
@@ -197,6 +198,96 @@ def search_with_serper(query: str, subscription_key: str):
logger.error(f"Error encountered: {json_content}")
return []

def search_with_searchapi(query: str, subscription_key: str):
    """
    Search with SearchApi.io and return the contexts.
    """
    payload = {
        "q": query,
        "engine": "google",
        "num": (
            REFERENCE_COUNT
            if REFERENCE_COUNT % 10 == 0
            else (REFERENCE_COUNT // 10 + 1) * 10
        ),
    }
    headers = {"Authorization": f"Bearer {subscription_key}", "Content-Type": "application/json"}
    logger.info(
        f"{payload} {headers} {subscription_key} {query} {SEARCHAPI_SEARCH_ENDPOINT}"
    )
    response = requests.get(
        SEARCHAPI_SEARCH_ENDPOINT,
        headers=headers,
        params=payload,
        timeout=30,
    )
    if not response.ok:
        logger.error(f"{response.status_code} {response.text}")
        raise HTTPException(response.status_code, "Search engine error.")
    json_content = response.json()
    try:
        # convert to the same format as bing/google
        contexts = []

        if json_content.get("answer_box"):
            if json_content["answer_box"].get("organic_result"):
                title = json_content["answer_box"].get("organic_result").get("title", "")
                url = json_content["answer_box"].get("organic_result").get("link", "")
            if json_content["answer_box"].get("type") == "population_graph":
                title = json_content["answer_box"].get("place", "")
                url = json_content["answer_box"].get("explore_more_link", "")

            title = json_content["answer_box"].get("title", "")
            url = json_content["answer_box"].get("link")
            snippet = json_content["answer_box"].get("answer") or json_content["answer_box"].get("snippet")

            if url and snippet:
                contexts.append({
                    "name": title,
                    "url": url,
                    "snippet": snippet
                })

        if json_content.get("knowledge_graph"):
            if json_content["knowledge_graph"].get("source"):
                url = json_content["knowledge_graph"].get("source").get("link", "")

            url = json_content["knowledge_graph"].get("website", "")
            snippet = json_content["knowledge_graph"].get("description")

            if url and snippet:
                contexts.append({
                    "name": json_content["knowledge_graph"].get("title", ""),
                    "url": url,
                    "snippet": snippet
                })

        contexts += [
            {"name": c["title"], "url": c["link"], "snippet": c.get("snippet", "")}
            for c in json_content["organic_results"]
        ]

        if json_content.get("related_questions"):
            for question in json_content["related_questions"]:
                if question.get("source"):
                    url = question.get("source").get("link", "")
                else:
                    url = ""

                snippet = question.get("answer", "")

                if url and snippet:
                    contexts.append({
                        "name": question.get("question", ""),
                        "url": url,
                        "snippet": snippet
                    })

        return contexts[:REFERENCE_COUNT]
    except KeyError:
        logger.error(f"Error encountered: {json_content}")
        return []
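
A minimal sketch of exercising the new function on its own (illustrative, not taken from this diff), assuming `SEARCHAPI_API_KEY` is exported and the module-level pieces above (`REFERENCE_COUNT`, `SEARCHAPI_SEARCH_ENDPOINT`, `logger`) are in scope:

```python
# Illustrative usage only: call search_with_searchapi directly and print the contexts.
import json
import os

if __name__ == "__main__":
    key = os.environ["SEARCHAPI_API_KEY"]  # assumed to be set in the environment
    contexts = search_with_searchapi("who said live long and prosper", key)
    for context in contexts:
        print(json.dumps(context, indent=2))
```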

class RAG(Photon):
"""
Retrieval-Augmented Generation Demo from Lepton AI.
@@ -251,6 +342,8 @@ class RAG(Photon):
"GOOGLE_SEARCH_API_KEY",
# If you use Serper, you need to specify the search api key.
"SERPER_SEARCH_API_KEY",
# If you use SearchApi, you need to specify the search api key.
"SEARCHAPI_API_KEY",
# You need to specify the workspace token to query lepton's LLM models.
"LEPTON_WORKSPACE_TOKEN",
],
Expand Down Expand Up @@ -316,8 +409,14 @@ def init(self):
                query,
                self.search_api_key,
            )
        elif self.backend == "SEARCHAPI":
            self.search_api_key = os.environ["SEARCHAPI_API_KEY"]
            self.search_function = lambda query: search_with_searchapi(
                query,
                self.search_api_key,
            )
        else:
            raise RuntimeError("Backend must be LEPTON, BING, GOOGLE or SERPER.")
            raise RuntimeError("Backend must be LEPTON, BING, GOOGLE, SERPER or SEARCHAPI.")
        self.model = os.environ["LLM_MODEL"]
        # An executor to carry out async tasks, such as uploading to KV.
        self.executor = concurrent.futures.ThreadPoolExecutor(
