Commit

Add files via upload
Uses Beautiful Soup for web scraping, PyTorch for image segmentation, and Wordcloud to create the final product.
dtretiak authored Sep 26, 2019
1 parent 67dd3c2 commit 4dbdd51
Showing 1 changed file with 27 additions and 0 deletions.
ScrapedWordCloud.py (27 additions, 0 deletions)
@@ -0,0 +1,27 @@
import bs4
import matplotlib.pyplot as plt
import wordcloud
import requests

url = input("Paste a Wikipedia URL here: ")

# download the page HTML
req = requests.get(url)
req.raise_for_status()

# create a soup object and use a CSS selector to grab the paragraph elements
wikiSoup = bs4.BeautifulSoup(req.text, features="lxml")
text_elem = wikiSoup.select('p')

# iterate through the paragraph elements and combine them into one string
full_text = []
for par in text_elem:
    full_text.append(par.getText().replace('\n', '').replace('\xa0', ' '))
full_text = ''.join(full_text)

# create the word cloud, save it, and display it
wikiCloud = wordcloud.WordCloud().generate(full_text)
plt.imshow(wikiCloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('wikiCloud.png')  # save before plt.show(), which clears the figure
plt.show()
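
The commit description also mentions PyTorch for image segmentation, which this file does not yet use. Below is a minimal sketch of how that step could plug into the word cloud: it assumes a local image shape.jpg and a pretrained torchvision DeepLabV3 model, and it passes the resulting background mask to WordCloud(mask=...) along with the full_text scraped above. The image path and model choice are illustrative assumptions, not part of this commit.

import numpy
import torch
import torchvision
import wordcloud
from PIL import Image

# hypothetical shape image to segment; any RGB photo would do
img = Image.open('shape.jpg').convert('RGB')

# pretrained DeepLabV3 segmentation model from torchvision (assumed available)
model = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=True).eval()

preprocess = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
])

with torch.no_grad():
    out = model(preprocess(img).unsqueeze(0))['out'][0]  # [num_classes, H, W]
labels = out.argmax(0).numpy()  # per-pixel class ids; class 0 is background

# wordcloud treats white (255) pixels as masked out, so send the background to 255
mask = numpy.where(labels == 0, 255, 0).astype(numpy.uint8)
wikiCloud = wordcloud.WordCloud(mask=mask).generate(full_text)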
