Skip to content

Commit

Permalink
Merge pull request geekan#69 from shenchucheng/main
Browse files Browse the repository at this point in the history
Add web page scraping feature implemented by Playwright/Selenium
  • Loading branch information
geekan authored Jul 25, 2023
2 parents a538f9a + e44410b commit 007c8c0
Show file tree
Hide file tree
Showing 13 changed files with 479 additions and 30 deletions.
12 changes: 12 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#OPENAI_API_KEY: "YOUR_API_KEY"
#OPENAI_API_BASE: "YOUR_API_BASE"
#OPENAI_PROXY: "http://127.0.0.1:8118"
OPENAI_API_MODEL: "gpt-4"
MAX_TOKENS: 1500
RPM: 10
Expand All @@ -31,6 +32,17 @@ RPM: 10
## Visit https://serper.dev/ to get key.
#SERPER_API_KEY: "YOUR_API_KEY"

#### for web access

## Supported values: playwright/selenium
#WEB_BROWSER_ENGINE: playwright

## Supported values: chromium/firefox/webkit, visit https://playwright.dev/python/docs/api/class-browsertype
#PLAYWRIGHT_BROWSER_TYPE: chromium

## Supported values: chrome/firefox/edge/ie, visit https://www.selenium.dev/documentation/webdriver/browsers/
# SELENIUM_BROWSER_TYPE: chrome

#### for TTS

#AZURE_TTS_SUBSCRIPTION_KEY: "YOUR_API_KEY"
Expand Down
4 changes: 2 additions & 2 deletions docs/ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ To reach version v0.5, approximately 70% of the following tasks need to be compl
5. Plugins: Compatibility with plugin system
6. Tools
1. ~~Support SERPER api~~
2. Support Selenium apis
3. Support Playwright apis
2. ~~Support Selenium apis~~
3. ~~Support Playwright apis~~
7. Roles
1. Perfect the action pool/skill pool for each role
2. Red Book blogger
Expand Down
60 changes: 35 additions & 25 deletions metagpt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
提供配置,单例
"""
import os
import openai

import yaml

from metagpt.const import PROJECT_ROOT
from metagpt.logs import logger
from metagpt.tools import SearchEngineType
from metagpt.utils.singleton import Singleton
from metagpt.tools import SearchEngineType, WebBrowserEngineType


class NotConfiguredException(Exception):
Expand All @@ -32,40 +33,49 @@ class Config(metaclass=Singleton):
secret_key = config.get_key("MY_SECRET_KEY")
print("Secret key:", secret_key)
"""

_instance = None
key_yaml_file = PROJECT_ROOT / 'config/key.yaml'
default_yaml_file = PROJECT_ROOT / 'config/config.yaml'
key_yaml_file = PROJECT_ROOT / "config/key.yaml"
default_yaml_file = PROJECT_ROOT / "config/config.yaml"

def __init__(self, yaml_file=default_yaml_file):
self._configs = {}
self._init_with_config_files_and_env(self._configs, yaml_file)
logger.info('Config loading done.')
self.openai_api_key = self._get('OPENAI_API_KEY')
if not self.openai_api_key or 'YOUR_API_KEY' == self.openai_api_key:
logger.info("Config loading done.")
self.global_proxy = self._get("GLOBAL_PROXY")
self.openai_api_key = self._get("OPENAI_API_KEY")
if not self.openai_api_key or "YOUR_API_KEY" == self.openai_api_key:
raise NotConfiguredException("Set OPENAI_API_KEY first")
self.openai_api_base = self._get('OPENAI_API_BASE')
if not self.openai_api_base or 'YOUR_API_BASE' == self.openai_api_base:
logger.info("Set OPENAI_API_BASE in case of network issues")
self.openai_api_type = self._get('OPENAI_API_TYPE')
self.openai_api_version = self._get('OPENAI_API_VERSION')
self.openai_api_rpm = self._get('RPM', 3)
self.openai_api_model = self._get('OPENAI_API_MODEL', "gpt-4")
self.max_tokens_rsp = self._get('MAX_TOKENS', 2048)
self.deployment_id = self._get('DEPLOYMENT_ID')

self.claude_api_key = self._get('Anthropic_API_KEY')

self.serpapi_api_key = self._get('SERPAPI_API_KEY')
self.serper_api_key = self._get('SERPER_API_KEY')
self.google_api_key = self._get('GOOGLE_API_KEY')
self.google_cse_id = self._get('GOOGLE_CSE_ID')
self.search_engine = self._get('SEARCH_ENGINE', SearchEngineType.SERPAPI_GOOGLE)
self.openai_api_base = self._get("OPENAI_API_BASE")
if not self.openai_api_base or "YOUR_API_BASE" == self.openai_api_base:
openai_proxy = self._get("OPENAI_PROXY") or self.global_proxy
if openai_proxy:
openai.proxy = openai_proxy
else:
logger.info("Set OPENAI_API_BASE in case of network issues")
self.openai_api_type = self._get("OPENAI_API_TYPE")
self.openai_api_version = self._get("OPENAI_API_VERSION")
self.openai_api_rpm = self._get("RPM", 3)
self.openai_api_model = self._get("OPENAI_API_MODEL", "gpt-4")
self.max_tokens_rsp = self._get("MAX_TOKENS", 2048)
self.deployment_id = self._get("DEPLOYMENT_ID")

self.claude_api_key = self._get('Anthropic_API_KEY')
self.serpapi_api_key = self._get("SERPAPI_API_KEY")
self.serper_api_key = self._get("SERPER_API_KEY")
self.google_api_key = self._get("GOOGLE_API_KEY")
self.google_cse_id = self._get("GOOGLE_CSE_ID")
self.search_engine = self._get("SEARCH_ENGINE", SearchEngineType.SERPAPI_GOOGLE)

self.web_browser_engine = WebBrowserEngineType(self._get("WEB_BROWSER_ENGINE", "playwright"))
self.playwright_browser_type = self._get("PLAYWRIGHT_BROWSER_TYPE", "chromium")
self.selenium_browser_type = self._get("SELENIUM_BROWSER_TYPE", "chrome")

self.long_term_memory = self._get('LONG_TERM_MEMORY', False)
if self.long_term_memory:
logger.warning("LONG_TERM_MEMORY is True")

self.max_budget = self._get('MAX_BUDGET', 10.0)
self.max_budget = self._get("MAX_BUDGET", 10.0)
self.total_cost = 0.0

def _init_with_config_files_and_env(self, configs: dict, yaml_file):
Expand All @@ -77,7 +87,7 @@ def _init_with_config_files_and_env(self, configs: dict, yaml_file):
continue

# 加载本地 YAML 文件
with open(_yaml_file, 'r', encoding="utf-8") as file:
with open(_yaml_file, "r", encoding="utf-8") as file:
yaml_data = yaml.safe_load(file)
if not yaml_data:
continue
Expand Down
6 changes: 6 additions & 0 deletions metagpt/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ class SearchEngineType(Enum):
DIRECT_GOOGLE = auto()
SERPER_GOOGLE = auto()
CUSTOM_ENGINE = auto()


class WebBrowserEngineType(Enum):
    """Backends supported by the web page scraping tool."""
    PLAYWRIGHT = "playwright"  # scrape via the playwright package
    SELENIUM = "selenium"  # scrape via selenium webdriver
    CUSTOM = "custom"  # caller supplies its own run coroutine
59 changes: 59 additions & 0 deletions metagpt/tools/web_browser_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python

from __future__ import annotations
import asyncio
import importlib

from typing import Any, Callable, Coroutine, overload

from metagpt.config import CONFIG
from metagpt.tools import WebBrowserEngineType
from bs4 import BeautifulSoup


class WebBrowserEngine:
    """Unified entry point for fetching and parsing web pages.

    The backend module (playwright/selenium) is imported lazily so that its
    optional dependency is only required when that engine is actually selected.
    """

    def __init__(
        self,
        engine: WebBrowserEngineType | None = None,
        run_func: Callable[..., Coroutine[Any, Any, str | list[str]]] | None = None,
        parse_func: Callable[[str], str] | None = None,
    ):
        """Select and bind a scraping backend.

        Args:
            engine: Backend to use; defaults to CONFIG.web_browser_engine.
            run_func: Coroutine returning raw HTML for one or more URLs;
                required when engine is CUSTOM, ignored otherwise.
            parse_func: Converts one raw HTML page to text; defaults to
                get_page_content.

        Raises:
            NotImplementedError: If the engine is unknown, or CUSTOM is chosen
                without providing run_func.
        """
        engine = engine or CONFIG.web_browser_engine

        if engine == WebBrowserEngineType.PLAYWRIGHT:
            module = "metagpt.tools.web_browser_engine_playwright"
            run_func = importlib.import_module(module).PlaywrightWrapper().run
        elif engine == WebBrowserEngineType.SELENIUM:
            module = "metagpt.tools.web_browser_engine_selenium"
            run_func = importlib.import_module(module).SeleniumWrapper().run
        elif engine == WebBrowserEngineType.CUSTOM:
            # Fail fast here rather than with an opaque TypeError on run().
            if run_func is None:
                raise NotImplementedError("run_func must be provided when engine is CUSTOM")
        else:
            raise NotImplementedError(f"Unsupported web browser engine: {engine}")
        self.parse_func = parse_func or get_page_content
        self.run_func = run_func
        self.engine = engine

    @overload
    async def run(self, url: str) -> str:
        ...

    @overload
    async def run(self, url: str, *urls: str) -> list[str]:
        ...

    async def run(self, url: str, *urls: str) -> str | list[str]:
        """Scrape and parse pages: a str for one URL, a list for several."""
        page = await self.run_func(url, *urls)
        if isinstance(page, str):
            return self.parse_func(page)
        return [self.parse_func(i) for i in page]


def get_page_content(page: str):
    """Extract readable text (headings, paragraphs, code blocks) from raw HTML."""
    soup = BeautifulSoup(page, "html.parser")
    chunks = []
    for node in soup.find_all(["h1", "h2", "h3", "h4", "h5", "p", "pre"]):
        chunks.append(node.text.strip())
    return "\n".join(chunks)


if __name__ == "__main__":
text = asyncio.run(WebBrowserEngine().run("https://fuzhi.ai/"))
print(text)
121 changes: 121 additions & 0 deletions metagpt/tools/web_browser_engine_playwright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python
from __future__ import annotations

import asyncio
from pathlib import Path
import sys
from typing import Literal
from playwright.async_api import async_playwright
from metagpt.config import CONFIG
from metagpt.logs import logger


class PlaywrightWrapper:
    """Wrapper around Playwright.

    To use this module, you should have the `playwright` Python package installed and ensure that
    the required browsers are also installed. You can install playwright by running the command
    `pip install metagpt[playwright]` and download the necessary browser binaries by running the
    command `playwright install` for the first time.
    """

    def __init__(
        self,
        browser_type: Literal["chromium", "firefox", "webkit"] | None = None,
        launch_kwargs: dict | None = None,
        **kwargs,
    ) -> None:
        """Configure the browser to drive.

        Args:
            browser_type: Playwright browser name; defaults to
                CONFIG.playwright_browser_type.
            launch_kwargs: Extra keyword arguments forwarded to
                `BrowserType.launch`.
            **kwargs: Only `ignore_https_errors` is honored; it is forwarded
                to `Browser.new_context`.
        """
        if browser_type is None:
            browser_type = CONFIG.playwright_browser_type
        self.browser_type = browser_type
        launch_kwargs = launch_kwargs or {}
        # Route traffic through the global proxy unless the caller already set one
        # (either via the `proxy` kwarg or a --proxy-server command-line arg).
        if CONFIG.global_proxy and "proxy" not in launch_kwargs:
            args = launch_kwargs.get("args", [])
            if not any(i.startswith("--proxy-server=") for i in args):
                launch_kwargs["proxy"] = {"server": CONFIG.global_proxy}
        self.launch_kwargs = launch_kwargs
        context_kwargs = {}
        if "ignore_https_errors" in kwargs:
            context_kwargs["ignore_https_errors"] = kwargs["ignore_https_errors"]
        self._context_kwargs = context_kwargs
        self._has_run_precheck = False

    async def run(self, url: str, *urls: str) -> str | list[str]:
        """Fetch raw HTML for one or more URLs.

        Returns a single string for one URL, a list (in input order) for
        several. A page that fails to load yields an error string instead
        of raising.
        """
        async with async_playwright() as ap:
            browser_type = getattr(ap, self.browser_type)
            await self._run_precheck(browser_type)
            browser = await browser_type.launch(**self.launch_kwargs)

            async def _scrape(url):
                # One isolated context per URL so concurrent scrapes don't share state.
                context = await browser.new_context(**self._context_kwargs)
                page = await context.new_page()
                async with page:
                    try:
                        await page.goto(url)
                        # Trigger lazy-loaded content before capturing the DOM.
                        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                        content = await page.content()
                        return content
                    except Exception as e:
                        return f"Fail to load page content for {e}"

            if urls:
                return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls))
            return await _scrape(url)

    async def _run_precheck(self, browser_type):
        """Ensure the browser binary exists, installing it on first use if needed."""
        if self._has_run_precheck:
            return

        executable_path = Path(browser_type.executable_path)
        if not executable_path.exists() and "executable_path" not in self.launch_kwargs:
            kwargs = {}
            if CONFIG.global_proxy:
                # Let the download go through the proxy as well.
                kwargs["env"] = {"ALL_PROXY": CONFIG.global_proxy}
            await _install_browsers(self.browser_type, **kwargs)
            if not executable_path.exists():
                # On unsupported OSes playwright may install a fallback build under a
                # differently-versioned directory; pick the first one found.
                parts = executable_path.parts
                available_paths = list(Path(*parts[:-3]).glob(f"{self.browser_type}-*"))
                if available_paths:
                    logger.warning(
                        "It seems that your OS is not officially supported by Playwright. "
                        "Try to set executable_path to the fallback build version."
                    )
                    executable_path = available_paths[0].joinpath(*parts[-2:])
                    self.launch_kwargs["executable_path"] = str(executable_path)
        self._has_run_precheck = True


async def _install_browsers(*browsers, **kwargs) -> None:
    """Run `playwright install` for the given browsers in a subprocess.

    Args:
        *browsers: Browser names understood by `playwright install`
            (e.g. "chromium").
        **kwargs: Extra arguments for `asyncio.create_subprocess_exec`,
            e.g. `env` to route the download through a proxy.
    """
    process = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "playwright",
        "install",
        *browsers,
        "--with-deps",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        **kwargs,
    )

    # Mirror installer output into our logs while the subprocess runs.
    await asyncio.gather(_log_stream(process.stdout, logger.info), _log_stream(process.stderr, logger.warning))

    if await process.wait() == 0:
        logger.info("Install browser for playwright successfully.")
    else:
        logger.warning("Fail to install browser for playwright.")


async def _log_stream(sr, log_func):
    """Forward each line read from *sr* to *log_func*, prefixed, until EOF."""
    while line := await sr.readline():
        log_func(f"[playwright install browser]: {line.decode().strip()}")


if __name__ == "__main__":
for i in ("chromium", "firefox", "webkit"):
text = asyncio.run(PlaywrightWrapper(i).run("https://httpbin.org/ip"))
print(text)
print(i)
Loading

0 comments on commit 007c8c0

Please sign in to comment.