-
Notifications
You must be signed in to change notification settings - Fork 21
/
scrape.py
73 lines (59 loc) · 2.45 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
try:
import praw
except ImportError:
print "Unable to find praw. see https://github.com/praw-dev/praw"
raise
from time import sleep
from urllib import urlopen
import os
import datetime
now = datetime.datetime.now
from sanitize import sanitize
def save_to(url, filename):
    """Download the resource at ``url`` and store its raw bytes in ``filename``.

    Args:
        url: Address passed straight to ``urlopen``.
        filename: Path of the local file to create or overwrite.

    Raises:
        IOError: Propagated from ``urlopen`` or from opening/writing the
            local file (``OSError`` on interpreters where they are unified).
    """
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when the read fails part-way.
        response.close()
    # Binary mode: the payload is image data, so no newline translation or
    # text encoding may be applied on the way to disk.
    with open(filename, mode='wb') as output:
        output.write(data)
def main():
    """Download the day's top image submissions for each configured subreddit.

    Reads ``settings.txt`` from the current working directory. The evaluated
    settings object is indexed with these keys:

    * ``'user_agent'`` -- passed to ``praw.Reddit``.
    * ``'extensions'`` -- iterable of URL suffixes treated as images.
    * ``'subreddits'`` -- mapping of base directory -> iterable of subreddit
      names; images are stored under ``<base_dir>/<subreddit>/``.
    * ``'verbose'`` -- when truthy, progress messages are printed.

    For every saved image one line of the form
    ``<timestamp>|<local filename>|<ups>|<downs>|<url>`` is appended to
    ``<base_dir>/update.log`` before the download starts.
    """
    # SECURITY: eval() executes whatever settings.txt contains. Only run this
    # with a settings file you trust; if the file holds nothing but literals,
    # ast.literal_eval would be a safer replacement.
    with open('settings.txt') as settings_file:
        settings = eval(settings_file.read())
    user_agent = settings['user_agent']
    image_extensions = settings['extensions']
    subreddits = settings['subreddits']
    verbose = settings['verbose']

    def alert(message):
        """Print ``message`` only when the 'verbose' setting is truthy."""
        if verbose:
            print(message)

    # Lower-case the suffixes once; str.endswith accepts a tuple of options.
    lowered_extensions = tuple(ext.lower() for ext in image_extensions)

    alert("Beginning web scrape.")
    alert('Getting user agent...')
    r = praw.Reddit(user_agent=user_agent)
    alert('Got user agent.')

    # Progress is measured in subreddits processed, not in images saved.
    total_n = sum(len(v) for v in subreddits.values())
    n_so_far = 0
    for base_dir, reddits_for_dir in subreddits.items():
        for reddit in reddits_for_dir:
            subreddit_dir = os.path.join(base_dir, reddit)
            if not os.path.exists(subreddit_dir):
                alert('Making %s' % subreddit_dir)
                os.makedirs(subreddit_dir)

            ## The API call
            submissions = r.get_subreddit(reddit).get_top_from_day(limit=5)
            alert('/r/%s' % reddit)
            for sub in submissions:
                url = sub.url
                alert('url is %s' % url)
                if not url.lower().endswith(lowered_extensions):
                    continue
                alert('Found a picture.')
                votes = '%s|%s' % (sub.ups, sub.downs)
                extension = url.split('.')[-1]
                alert('Extension is %s' % extension)
                title = sanitize(sub.title)
                # Drop one trailing dot so the name does not end in '..ext'.
                if title.endswith('.'):
                    title = title[:-1]
                local_filename = os.path.join(
                    subreddit_dir, '%s.%s' % (title, extension))
                alert('Saving to %s' % local_filename)
                with open(os.path.join(base_dir, 'update.log'), 'a') as output:
                    output.write('%s|%s|%s|%s\n'
                                 % (now(), local_filename, votes, url))
                save_to(url, local_filename)

            # Blank separator line between subreddits (printed even when
            # not verbose, as before).
            print('')
            n_so_far += 1
            # 100.0 forces true division regardless of interpreter version.
            percent = int(100.0 * n_so_far / total_n)
            alert("%d percent complete." % percent)
            sleep(2.5)  # Avoid offending the Reddit API Gods!
    alert("Completed web scrape.")
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()