🏎️ Make content refresher async (reworkd#1184)
* 🏎️ Make content refresher async

awtkns authored Aug 3, 2023
1 parent 9e8e49e commit 03b8543
Showing 3 changed files with 94 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -52,3 +52,4 @@ yarn-error.log*
.sentryclirc
/volumes/
schema.prismae
+*.sql
@@ -1,13 +1,13 @@
import re
from typing import Any

-import anthropic
import requests
from bs4 import BeautifulSoup
from loguru import logger
from scrapingbee import ScrapingBeeClient

from reworkd_platform.schemas.workflow.base import Block, BlockIOBase
+from reworkd_platform.services.anthropic import ClaudeService, HumanAssistantPrompt
from reworkd_platform.settings import settings


@@ -29,10 +29,10 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
        logger.info(f"Starting {self.type}")
        target_url = self.input.url

-        target_content = get_page_content(target_url)
+        target_content = await get_page_content(target_url)
        logger.info(target_content)

-        keywords = find_content_kws(target_content)
+        keywords = await find_content_kws(target_content)
        logger.info(keywords)

        source_urls = search_results(keywords)
@@ -41,23 +41,24 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
        logger.info(source_urls)

        source_contents = [
-            get_page_content(url)
+            await get_page_content(url)
            for url in source_urls[:3] # TODO: remove limit of 3 sources
-        ] # TODO: async/multithread the LLM calls
+        ]

        source_contents = [
            content for content in source_contents if content is not None
        ]

        logger.info(source_contents)
+        new_info = [
+            await find_new_info(target_content, source_content)
+            for source_content in source_contents
+        ]

-        new_infos = "\n\n".join(
-            [
-                find_new_info(target_content, source_content)
-                for source_content in source_contents
-            ]
-        )
+        new_infos = "\n\n".join(new_info)
        logger.info(new_infos)

-        updated_target_content = add_info(target_content, new_infos)
+        updated_target_content = await add_info(target_content, new_infos)
        logger.info(updated_target_content)

        return ContentRefresherOutput(
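Note: even with the await keywords above, the per-source fetches and find_new_info calls still run one at a time inside the list comprehensions (the old "# TODO: async/multithread the LLM calls" comment was dropped rather than resolved). A minimal sketch of fanning those calls out with asyncio.gather, assuming get_page_content and find_new_info keep the signatures shown in this diff; the gather_new_info helper is illustrative, not code from this commit:

import asyncio

async def gather_new_info(target_content: str, source_urls: list[str]) -> str:
    # Fetch the candidate source pages concurrently instead of one await per loop iteration.
    source_contents = await asyncio.gather(
        *(get_page_content(url) for url in source_urls[:3])  # TODO: remove limit of 3 sources
    )
    source_contents = [content for content in source_contents if content is not None]

    # Run the per-source Claude comparisons concurrently as well.
    new_info = await asyncio.gather(
        *(find_new_info(target_content, content) for content in source_contents)
    )
    return "\n\n".join(new_info)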
@@ -70,12 +71,13 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput:
scraper = ScrapingBeeClient(
    api_key=settings.scrapingbee_api_key,
)
-claude = anthropic.Anthropic(
+
+claude = ClaudeService(
    api_key=settings.anthropic_api_key,
)


-def get_page_content(url: str) -> str:
+async def get_page_content(url: str) -> str:
    page = requests.get(url)
    if page.status_code != 200:
        page = scraper.get(url)
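Worth noting: get_page_content is now awaitable, but requests.get and scraper.get inside it are still synchronous and will block the event loop; only the Claude call further down actually awaits. If that ever becomes a problem, one option (an assumption, not part of this commit) is to push the blocking HTTP call onto a worker thread. The fetch_page helper below is illustrative only:

import asyncio
import requests

async def fetch_page(url: str) -> requests.Response:
    # requests.get is blocking; asyncio.to_thread (Python 3.9+) runs it off the event loop.
    return await asyncio.to_thread(requests.get, url, timeout=30)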
@@ -90,14 +92,17 @@ def get_page_content(url: str) -> str:
        ]
    )

-    prompt = f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here are the line numbers of the main content:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma.",
+        assistant_prompt="Here are the line numbers of the main content:",
+    )
+
+    line_nums = await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=500,
        temperature=0,
    )
-    line_nums = response.completion.strip()

    if len(line_nums) == 0:
        return ""

@@ -116,17 +121,17 @@ def get_page_content(url: str) -> str:
    return "\n".join(content)


-def find_content_kws(content: str) -> str:
+async def find_content_kws(content: str) -> str:
    # Claude: find search keywords that content focuses on
-    prompt = f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a short search query that best matches the content of the article:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively.",
+        assistant_prompt="Here is a short search query that best matches the content of the article:",
+    )
+
+    return await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=20,
        temperature=0,
    )
-    response_message = response.completion.strip()
-    return response_message


def search_results(search_query: str) -> list[str]:
@@ -142,33 +147,34 @@ def search_results(search_query: str) -> list[str]:
        },
    )
    response.raise_for_status()
-    search_results = response.json()
-    urls = [result["link"] for result in search_results["organic"]]
+    urls = [result["link"] for result in response.json()["organic"]]
    return urls


-def find_new_info(target: str, source: str) -> str:
+async def find_new_info(target: str, source: str) -> str:
    # Claude: info mentioned in source that is not mentioned in target
-    prompt = f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a list of claims in the SOURCE that are not in the TARGET:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.",
+        assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:",
+    )
+
+    response = await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=5000,
        temperature=0,
    )
-    response_message = response.completion.strip()
-    new_info = "\n".join(response_message.split("\n\n"))

+    new_info = "\n".join(response.split("\n\n"))
    return new_info


-def add_info(target: str, info: str) -> str:
+async def add_info(target: str, info: str) -> str:
    # Claude: rewrite target to include the info
-    prompt = f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles."
-    response = claude.completions.create(
-        model="claude-2",
-        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
+    prompt = HumanAssistantPrompt(
+        human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles.",
+        assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
+    )
+
+    return await claude.completion(
+        prompt=prompt,
        max_tokens_to_sample=5000,
        temperature=0,
    )
-    response_message = response.completion.strip()
-    return response_message
42 changes: 42 additions & 0 deletions platform/reworkd_platform/services/anthropic.py
@@ -0,0 +1,42 @@
from typing import Any, Optional

from anthropic import AsyncAnthropic
from pydantic import BaseModel


class AbstractPrompt(BaseModel):
    def to_string(self) -> str:
        raise NotImplementedError


class HumanAssistantPrompt(AbstractPrompt):
    assistant_prompt: str
    human_prompt: str

    def to_string(self) -> str:
        return (
            f"""\n\nHuman: {self.human_prompt}\n\nAssistant: {self.assistant_prompt}"""
        )


class ClaudeService:
    def __init__(self, api_key: Optional[str], model: str = "claude-2"):
        self.claude = AsyncAnthropic(api_key=api_key)
        self.model = model

    async def completion(
        self,
        prompt: AbstractPrompt,
        max_tokens_to_sample: int,
        temperature: int = 0,
        **kwargs: Any,
    ) -> str:
        return (
            await self.claude.completions.create(
                model=self.model,
                prompt=prompt.to_string(),
                max_tokens_to_sample=max_tokens_to_sample,
                temperature=temperature,
                **kwargs,
            )
        ).completion.strip()
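For context, a usage sketch of the new service based only on the interface above; the summarize wrapper and prompt text are illustrative, not taken from the rest of the codebase:

from reworkd_platform.services.anthropic import ClaudeService, HumanAssistantPrompt

claude = ClaudeService(api_key="sk-ant-...")  # placeholder key

async def summarize(text: str) -> str:
    prompt = HumanAssistantPrompt(
        human_prompt=f"Summarize the following text:\n{text}",
        assistant_prompt="Here is a short summary:",
    )
    # completion() builds the "\n\nHuman: ...\n\nAssistant: ..." string and strips the reply.
    return await claude.completion(prompt=prompt, max_tokens_to_sample=300)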
