Merge branch 'pr/15' into development

Rohithzr · Aug 12, 2024 · c3e165c · c3e165c
2 parents 332236b + ff97efd
commit c3e165c
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 6 deletions.
diff --git a/prompts/agent.tools.md b/prompts/agent.tools.md
@@ -62,6 +62,26 @@ Always verify memory by online.
 }
 ~~~
 
+### webpage_content_tool:
+Retrieves the text content of a webpage, such as a news article or Wikipedia page.
+Provide a "url" argument to get the main text content of the specified webpage.
+This tool is useful for gathering information from online sources.
+Always provide a full, valid URL including the protocol (http:// or https://).
+
+**Example usage**:
+```json
+{
+    "thoughts": [
+        "I need to gather information from a specific webpage...",
+        "I will use the webpage_content_tool to fetch the content...",
+    ],
+    "tool_name": "webpage_content_tool",
+    "tool_args": {
+        "url": "https://en.wikipedia.org/wiki/Artificial_intelligence",
+    }
+}
+```
+
 ### memory_tool:
 Manage long term memories. Allowed arguments are "query", "memorize", "forget" and "delete".
 Memories can help you remember important details and later reuse them.
@@ -193,4 +213,4 @@ When writing own code, ALWAYS put print/log statements inside and at the end of
         "code": "Y",
     }
 }
-~~~
+~~~
diff --git a/python/tools/knowledge_tool.py b/python/tools/knowledge_tool.py
@@ -16,7 +16,7 @@ def execute(self, question="", **kwargs):
         with concurrent.futures.ThreadPoolExecutor() as executor:
             # Schedule the two functions to be run in parallel
 
-            # perplexity search, if API provided
+            # perplexity search, if API key provided
             if os.getenv("API_KEY_PERPLEXITY"):
                 perplexity = executor.submit(perplexity_search.perplexity_search, question)
             else: 
@@ -31,14 +31,14 @@ def execute(self, question="", **kwargs):
             future_memory = executor.submit(memory_tool.search, self.agent, question)
 
             # Wait for both functions to complete
-            perplexity_result = (perplexity.result() if perplexity else "") or ""
+            perplexity_result = (perplexity.result() if perplexity else "")
             duckduckgo_result = duckduckgo.result()
             memory_result = future_memory.result()
 
         msg = files.read_file("prompts/tool.knowledge.response.md", 
-                              online_sources = perplexity_result + "\n\n" + str(duckduckgo_result),
+                              online_sources = ((perplexity_result + "\n\n") if perplexity else "") + str(duckduckgo_result),
                               memory = memory_result )
 
         if self.agent.handle_intervention(msg): pass # wait for intervention and handle it, if paused
 
-        return Response(message=msg, break_loop=False)
+        return Response(message=msg, break_loop=False)
diff --git a/python/tools/webpage_content_tool.py b/python/tools/webpage_content_tool.py
@@ -0,0 +1,39 @@
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+from newspaper import Article
+from python.helpers.tool import Tool, Response
+
+class WebpageContentTool(Tool):
+    """Agent tool that returns a web page's text; failures come back as error messages."""
+
+    def execute(self, url="", **kwargs):
+        """Fetch `url` once and return its text, or an error message, as a Response."""
+        if not url:
+            return Response(message="Error: No URL provided.", break_loop=False)
+
+        try:
+            # Require both a scheme and a host before touching the network.
+            parsed_url = urlparse(url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                return Response(message="Error: Invalid URL format.", break_loop=False)
+
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            # Hand the fetched HTML to newspaper so the page is not downloaded a
+            # second time (a second fetch would skip the timeout and status check).
+            article = Article(url)
+            article.download(input_html=response.text)
+            article.parse()
+
+            # Pages with no extractable article text fall back to all visible strings.
+            text_content = article.text
+            if not text_content:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                text_content = ' '.join(soup.stripped_strings)
+
+            return Response(message=f"Webpage content:\n\n{text_content}", break_loop=False)
+        except requests.RequestException as e:
+            return Response(message=f"Error fetching webpage: {str(e)}", break_loop=False)
+        except Exception as e:
+            return Response(message=f"An error occurred: {str(e)}", break_loop=False)
diff --git a/requirements.txt b/requirements.txt
@@ -12,4 +12,6 @@ sentence-transformers==3.0.1
 docker==7.1.0
 paramiko==3.4.0
 duckduckgo_search==6.1.12
-inputimeout==1.0.4
+inputimeout==1.0.4
+newspaper3k==0.2.8
+beautifulsoup4==4.12.3