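"""Browse web pages: scrape page text and hyperlinks, then summarize the
results for a question using the local Chat wrapper."""
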
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from chat import Chat


async def scrape_text(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        # Set up a more complete user agent and viewport size
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
            viewport={"width": 1280, "height": 800},
        )
        page = await context.new_page()
        try:
            await page.goto(url)
            # Add delay or wait for specific elements if necessary
            await asyncio.sleep(2)
            content = await page.content()
            soup = BeautifulSoup(content, "html.parser")
            for script in soup(["script", "style"]):
                script.extract()
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            # Split on double spaces so phrases stay intact; splitting on a
            # single space would put every word on its own line.
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
            return text
        finally:
            await browser.close()


async def scrape_links(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Check if the response contains an HTTP error
            if response.status >= 400:
                return "error"
            text = await response.text()
            soup = BeautifulSoup(text, "html.parser")
            for script in soup(["script", "style"]):
                script.extract()
            hyperlinks = extract_hyperlinks(soup)
            return format_hyperlinks(hyperlinks)


def extract_hyperlinks(soup):
    hyperlinks = []
    for link in soup.find_all("a", href=True):
        hyperlinks.append((link.text, link["href"]))
    return hyperlinks


def format_hyperlinks(hyperlinks):
    formatted_links = []
    for link_text, link_url in hyperlinks:
        formatted_links.append(f"{link_text} ({link_url})")
    return formatted_links


def split_text(text, max_length=3000):
    paragraphs = text.split("\n")
    current_length = 0
    current_chunk = []
    for paragraph in paragraphs:
        if current_length + len(paragraph) + 1 <= max_length:
            current_chunk.append(paragraph)
            current_length += len(paragraph) + 1
        else:
            yield "\n".join(current_chunk)
            current_chunk = [paragraph]
            current_length = len(paragraph) + 1
    if current_chunk:
        yield "\n".join(current_chunk)


def create_message(chunk, question):
    return (
        f'"""{chunk}""" Using the above text, please answer the following '
        f'question: "{question}" -- if the question cannot be answered using '
        "the text, please summarize the text."
    )


async def summarize_text(text, question, verbose=True):
    if not text:
        return "Error: No text to summarize"
    summary_tasks = []
    chunks = list(split_text(text))
    for chunk in chunks:
        chat = Chat(model_name="gpt-3.5-turbo", max_tokens=300, verbose=verbose)
        message = (
            f'"""{chunk}""" Using the above text, please extract any relevant '
            "information (texts, links, image links, etc.) to address the "
            f'following question: "{question}" -- if the question cannot be '
            "answered using the text, please summarize the text. Use bullet points."
        )
        chat.add_user_message(message)
        # Collect the coroutines without awaiting so all chunk summaries
        # can be gathered concurrently below.
        summary_tasks.append(chat.get_chat_response())
    summaries = await asyncio.gather(*summary_tasks)
    combined_summary = "\n".join(summaries)
    chat = Chat(model_name="gpt-3.5-turbo", verbose=verbose)
    message = (
        f"{combined_summary}\n\nThe above text contains extracted details of a "
        f'website. For the question "{question}", please provide a comprehensive '
        "markdown response, including appropriate citations and extracting any "
        "relevant links for further investigation."
    )
    chat.add_user_message(message)
    final_summary = await chat.get_chat_response()
    return final_summary
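

# Minimal usage sketch (an illustration, not part of the original module).
# It assumes Chat.get_chat_response() is an async coroutine, as the awaits in
# summarize_text() imply, and that credentials for the Chat class (e.g. an
# OpenAI API key) are already configured in the environment.
async def _demo():
    url = "https://example.com"
    question = "What is this page about?"
    text = await scrape_text(url)
    print(await scrape_links(url))  # formatted "link text (url)" list
    print(await summarize_text(text, question, verbose=False))


if __name__ == "__main__":
    asyncio.run(_demo())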