🏎️ Make content refresher async (reworkd#1184)
* 🏎️ Make content refresher async

awtkns authored Aug 3, 2023
1 parent 9e8e49e commit 03b8543
Showing 3 changed files with 94 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -52,3 +52,4 @@ yarn-error.log*
.sentryclirc
/volumes/
schema.prismae
+*.sql
@@ -1,13 +1,13 @@
import re
from typing import Any

-import anthropic
import requests
from bs4 import BeautifulSoup
from loguru import logger
from scrapingbee import ScrapingBeeClient

from reworkd_platform.schemas.workflow.base import Block, BlockIOBase
+from reworkd_platform.services.anthropic import ClaudeService, HumanAssistantPrompt
from reworkd_platform.settings import settings


@@ -29,10 +29,10 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
        logger.info(f"Starting {self.type}")
        target_url = self.input.url

-        target_content = get_page_content(target_url)
+        target_content = await get_page_content(target_url)
        logger.info(target_content)

-        keywords = find_content_kws(target_content)
+        keywords = await find_content_kws(target_content)
        logger.info(keywords)

        source_urls = search_results(keywords)
@@ -41,23 +41,24 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
        logger.info(source_urls)

        source_contents = [
-            get_page_content(url)
+            await get_page_content(url)
            for url in source_urls[:3] # TODO: remove limit of 3 sources
-        ] # TODO: async/multithread the LLM calls
+        ]

        source_contents = [
            content for content in source_contents if content is not None
        ]

        logger.info(source_contents)
+        new_info = [
+            await find_new_info(target_content, source_content)
+            for source_content in source_contents
+        ]

-        new_infos = "\n\n".join(
-            [
-                find_new_info(target_content, source_content)
-                for source_content in source_contents
-            ]
-        )
+        new_infos = "\n\n".join(new_info)
        logger.info(new_infos)

-        updated_target_content = add_info(target_content, new_infos)
+        updated_target_content = await add_info(target_content, new_infos)
        logger.info(updated_target_content)

        return ContentRefresherOutput(
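Note: even with the await keywords above, the per-source fetches and find_new_info calls still run one at a time inside the list comprehensions (the old "# TODO: async/multithread the LLM calls" comment was dropped rather than resolved). A minimal sketch of fanning those calls out with asyncio.gather, assuming get_page_content and find_new_info keep the signatures shown in this diff; the gather_new_info helper is illustrative, not code from this commit:

import asyncio

async def gather_new_info(target_content: str, source_urls: list[str]) -> str:
    # Fetch the candidate source pages concurrently instead of one await per loop iteration.
    source_contents = await asyncio.gather(
        *(get_page_content(url) for url in source_urls[:3])  # TODO: remove limit of 3 sources
    )
    source_contents = [content for content in source_contents if content is not None]

    # Run the per-source Claude comparisons concurrently as well.
    new_info = await asyncio.gather(
        *(find_new_info(target_content, content) for content in source_contents)
    )
    return "\n\n".join(new_info)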
@@ -70,12 +71,13 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
scraper = ScrapingBeeClient(
    api_key=settings.scrapingbee_api_key,
)
-claude = anthropic.Anthropic(
+
+claude = ClaudeService(
    api_key=settings.anthropic_api_key,
)


-def get_page_content(url: str) -> str:
+async def get_page_content(url: str) -> str:
    page = requests.get(url)
    if page.status_code != 200:
        page = scraper.get(url)
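Worth noting: get_page_content is now awaitable, but requests.get and scraper.get inside it are still synchronous and will block the event loop; only the Claude call further down actually awaits. If that ever becomes a problem, one option (an assumption, not part of this commit) is to push the blocking HTTP call onto a worker thread. The fetch_page helper below is illustrative only:

import asyncio
import requests

async def fetch_page(url: str) -> requests.Response:
    # requests.get is blocking; asyncio.to_thread (Python 3.9+) runs it off the event loop.
    return await asyncio.to_thread(requests.get, url, timeout=30)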
@@ -90,14 +92,17 @@ def get_page_content(url: str) -> str:
        ]
    )

-    prompt = f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here are the line numbers of the main content:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma.",
+        assistant_prompt="Here are the line numbers of the main content:",
+    )
+
+    line_nums = await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=500,
        temperature=0,
    )
-    line_nums = response.completion.strip()

    if len(line_nums) == 0:
        return ""

@@ -116,17 +121,17 @@ def get_page_content(url: str) -> str:
    return "\n".join(content)


-def find_content_kws(content: str) -> str:
+async def find_content_kws(content: str) -> str:
    # Claude: find search keywords that content focuses on
-    prompt = f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a short search query that best matches the content of the article:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively.",
+        assistant_prompt="Here is a short search query that best matches the content of the article:",
+    )
+
+    return await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=20,
        temperature=0,
    )
-    response_message = response.completion.strip()
-    return response_message


def search_results(search_query: str) -> list[str]:
@@ -142,33 +147,34 @@ def search_results(search_query: str) -> list[str]:
        },
    )
    response.raise_for_status()
-    search_results = response.json()
-    urls = [result["link"] for result in search_results["organic"]]
+    urls = [result["link"] for result in response.json()["organic"]]
    return urls


-def find_new_info(target: str, source: str) -> str:
+async def find_new_info(target: str, source: str) -> str:
    # Claude: info mentioned in source that is not mentioned in target
-    prompt = f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a list of claims in the SOURCE that are not in the TARGET:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.",
+        assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:",
+    )
+
+    response = await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=5000,
        temperature=0,
    )
-    response_message = response.completion.strip()
-    new_info = "\n".join(response_message.split("\n\n"))

+    new_info = "\n".join(response.split("\n\n"))
    return new_info


-def add_info(target: str, info: str) -> str:
+async def add_info(target: str, info: str) -> str:
    # Claude: rewrite target to include the info
-    prompt = f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles.",
+        assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
+    )
+
+    return await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=5000,
        temperature=0,
    )
-    response_message = response.completion.strip()
-    return response_message
42 changes: 42 additions & 0 deletions platform/reworkd_platform/services/anthropic.py
@@ -0,0 +1,42 @@
from typing import Any, Optional

from anthropic import AsyncAnthropic
from pydantic import BaseModel


class AbstractPrompt(BaseModel):
    def to_string(self) -> str:
        raise NotImplementedError


class HumanAssistantPrompt(AbstractPrompt):
    assistant_prompt: str
    human_prompt: str

    def to_string(self) -> str:
        return (
            f"""\n\nHuman: {self.human_prompt}\n\nAssistant: {self.assistant_prompt}"""
        )


class ClaudeService:
    def __init__(self, api_key: Optional[str], model: str = "claude-2"):
        self.claude = AsyncAnthropic(api_key=api_key)
        self.model = model

    async def completion(
        self,
        prompt: AbstractPrompt,
        max_tokens_to_sample: int,
        temperature: int = 0,
        **kwargs: Any,
    ) -> str:
        return (
            await self.claude.completions.create(
                model=self.model,
                prompt=prompt.to_string(),
                max_tokens_to_sample=max_tokens_to_sample,
                temperature=temperature,
                **kwargs,
            )
        ).completion.strip()
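For context, a usage sketch of the new service based only on the interface above; the summarize wrapper and prompt text are illustrative, not taken from the rest of the codebase:

from reworkd_platform.services.anthropic import ClaudeService, HumanAssistantPrompt

claude = ClaudeService(api_key="sk-ant-...")  # placeholder key

async def summarize(text: str) -> str:
    prompt = HumanAssistantPrompt(
        human_prompt=f"Summarize the following text:\n{text}",
        assistant_prompt="Here is a short summary:",
    )
    # completion() builds the "\n\nHuman: ...\n\nAssistant: ..." string and strips the reply.
    return await claude.completion(prompt=prompt, max_tokens_to_sample=300)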
