Skip to content

Commit

Permalink
Update crypto newsletter substack to RSS. fixes OpenBB-finance#4496 (OpenBB-finance#4497)
Browse files Browse the repository at this point in the history

Co-authored-by: James Maslek <[email protected]>
  • Loading branch information
jfarid27 and jmaslek authored Mar 21, 2023
1 parent d671884 commit a43e414
Show file tree
Hide file tree
Showing 5 changed files with 29,556 additions and 4,699 deletions.
38 changes: 18 additions & 20 deletions openbb_terminal/cryptocurrency/defi/substack_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@


@log_start_end(log=logger)
def scrape_substack(url: str) -> List[List[str]]:
"""Helper method to scrape newsletters from substack.
def scrape_substack_rss(url: str, limit: int = 10) -> List[List[str]]:
"""Helper method to scrape newsletters from a substack rss feed.
[Source: substack.com]
Parameters
Expand All @@ -37,17 +37,16 @@ def scrape_substack(url: str) -> List[List[str]]:
"""

req = request(url)
soup = BeautifulSoup(req.text, features="lxml")
soup = BeautifulSoup(req.text, features="xml")
results: List[List[str]] = []
archive = soup.find("div", class_="portable-archive-list")
if archive:
posts = archive.find_all(
"div", class_="post-preview portable-archive-post has-image has-author-line"
)
rss = soup.find("rss")
if rss:
posts = rss.find_all("item")[:limit]
for post in posts:
title: str = post.a.text
post_url: str = post.a["href"]
time: str = post.find("time").get("datetime")
title: str = post.title.text
post_url: str = post.link.text
time_str = post.pubDate.text.split(" (")[0]
time: str = time_str
results.append([title, post_url, time])
return results

Expand All @@ -63,19 +62,18 @@ def get_newsletters() -> pd.DataFrame:
DataFrame with recent news from most popular DeFi related newsletters.
"""

urls = [
"https://defiweekly.substack.com/archive",
"https://newsletter.thedefiant.io/archive",
"https://thedailygwei.substack.com/archive",
"https://todayindefi.substack.com/archive",
"https://newsletter.banklesshq.com/archive",
"https://defislate.substack.com/archive",
urls_rss = [
"https://kermankohli.substack.com/feed",
"https://thedefiant.io/api/feed",
"https://thedailygwei.substack.com/feed",
"https://todayindefi.substack.com/feed",
"https://defislate.substack.com/feed",
]

threads = len(urls)
threads = len(urls_rss)
newsletters = []
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
for newsletter in executor.map(scrape_substack, urls):
for newsletter in executor.map(scrape_substack_rss, urls_rss):
try:
newsletters.append(pd.DataFrame(newsletter))
except KeyError as e:
Expand Down
Loading

0 comments on commit a43e414

Please sign in to comment.