-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Uses Beautiful Soup for web scraping, PyTorch for image segmentation, and Wordcloud to create the final product.
- Loading branch information
Showing 1 changed file with 27 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Build a word cloud from the paragraph text of a web page.
#
# Prompts for a URL, downloads the page, collects the text of every <p>
# element, renders a word cloud with matplotlib, saves it to
# 'wikiCloud.png' in the current directory, and then displays it.
import bs4
import matplotlib.pyplot as plt
import wordcloud
import requests
import numpy

# strip() drops the stray whitespace that often comes along with a pasted URL
url = input("Paste a wikipedia url here:").strip()

#download html; the timeout keeps a stalled connection from hanging forever
req = requests.get(url, timeout=10)
req.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response

#create soup object and use CSS selectors to get every <p> element
wikiSoup = bs4.BeautifulSoup(req.text, features="lxml")
text_elem = wikiSoup.select('p')

#combine paragraphs into one string
# Newlines and non-breaking spaces become ordinary spaces, and paragraphs are
# joined with a space, so the last word of one paragraph is never fused with
# the first word of the next.
full_text = ' '.join(
    par.getText().replace('\n', ' ').replace('\xa0', ' ')
    for par in text_elem
)

#create and save wordcloud
wikiCloud = wordcloud.WordCloud().generate(full_text)
plt.imshow(wikiCloud)
# Save BEFORE show(): with an interactive backend show() blocks until the
# window is closed, after which the figure is gone and savefig() would write
# an empty image.
plt.savefig('wikiCloud.png')
plt.show()