-
Notifications
You must be signed in to change notification settings - Fork 84
/
Copy pathanalyze_subreddit_titles.py
50 lines (43 loc) · 1.36 KB
/
analyze_subreddit_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
Analyze word frequency in subreddit post titles
Outputs the number of times a word is seen and orders them
by the most used words.
Quick and dirty script that is not optimized
www.devdungeon.com
"""
min_word_length = 5 # Ignore words shorter than this length
min_word_frequency = 4 # Only show words seen at least this many times
subreddit = 'houston'
request_limit = 1000 # Max 1000
import operator
import praw
import re
user_agent = ("Houston Subreddit Analyzer/0.1 by [email protected]")
reddit = praw.Reddit(user_agent=user_agent)
wordbank = dict()
count = 1
posts = reddit.get_subreddit(subreddit).get_hot(limit=request_limit)
for post in posts:
count += 1
title = str(post).split(' :: ')[1] # First value is upvotes followed by ::
words = title.split()
for word in words:
# Strip special chars
word = re.sub('[^a-zA-Z0-9]', '', word)
# Ignore anything less than 4 chars
if len(word) < min_word_length:
continue
# Lowercase to normalize
word = word.lower()
if not word in wordbank:
wordbank[word] = 1
else:
wordbank[word] += 1
sorted_words = sorted(wordbank.items(), key=operator.itemgetter(1))
sorted_words.reverse()
print "Processed " + repr(count) + " post titles."
for word in sorted_words:
# Ignore word that were seen less than X times
if word[1] > min_word_frequency - 1:
print repr(word[1]) + " " + word[0]