-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_count.py
89 lines (64 loc) · 2.73 KB
/
word_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Author: Christos Aleiferis
#
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
# Matches maximal runs of ASCII letters; every other character (digits,
# punctuation, whitespace) therefore acts as a word separator.
WORD_RE = re.compile(r"[a-zA-Z]+")
class MRWordFrequencyCount(MRJob):
    """Stripes-style job: which words follow each word, and how often.

    Pipeline (see ``steps``):

    1. ``mapper`` / ``reducer`` count, for every word, the words that
       immediately follow it on the same input line.
    2. ``relative_freq`` converts those counts to relative frequencies.
    3. ``ten_most_pop_words`` emits the most likely followers of
       ``TARGET_WORD``.
    """

    # the values the reducer receives are sorted
    SORT_VALUES = True

    # Word whose most likely followers are reported by the final step.
    # Override in a subclass (or edit here) to query a different word.
    TARGET_WORD = "my"

    # How many followers the final step reports for TARGET_WORD.
    TOP_N = 10

    @staticmethod
    def _merge_stripes(stripes):
        """Sum an iterable of {follower: value} dicts into a single dict."""
        merged = {}
        for stripe in stripes:
            for follower, value in stripe.items():
                merged[follower] = merged.get(follower, 0) + value
        return merged

    def mapper(self, _, line):
        """Yield ``(word, stripe)`` for each distinct word of ``line``.

        ``stripe`` maps every word that directly follows ``word`` on this
        line to the number of times it does so. Words are lower-cased
        before counting, so "My" and "my" share one stripe. The last word
        of a line has no follower and so adds nothing under its own key.
        """
        words = [w.lower() for w in WORD_RE.findall(line)]
        stripes = {}
        # Pair each word with its successor by position. Looking positions
        # up with list.index() would always land on the first occurrence
        # of a repeated word and count the wrong follower.
        for current, following in zip(words, words[1:]):
            stripe = stripes.setdefault(current, {})
            stripe[following] = stripe.get(following, 0) + 1
        # One stripe per distinct word, so repeats are not double counted.
        for word, stripe in stripes.items():
            yield word, stripe

    def reducer(self, word, stripes):
        """Merge all stripes of ``word`` into one ``{follower: count}``."""
        yield word, self._merge_stripes(stripes)

    def relative_freq(self, word, stripes):
        """Yield ``(word, {follower: share})`` with shares summing to 1.

        All stripes are merged before dividing, so the result does not
        depend on how many stripes arrive for the key. A key whose counts
        sum to zero yields an empty dict instead of dividing by zero.
        """
        counts = self._merge_stripes(stripes)
        total = sum(counts.values())
        if not total:
            yield word, {}
            return
        # float() keeps true division on interpreters where / truncates.
        yield word, {
            follower: float(count) / total
            for follower, count in counts.items()
        }

    def ten_most_pop_words(self, word, stripes):
        """Yield ``(TARGET_WORD, followers)``, most likely follower first.

        Keys other than ``TARGET_WORD`` produce no output. At most
        ``TOP_N`` followers are listed; ties are broken alphabetically so
        the result is deterministic. The result is yielded rather than
        printed so that it goes through the job's output protocol.
        """
        if word != self.TARGET_WORD:
            return
        scores = self._merge_stripes(stripes)
        ranked = sorted(scores, key=lambda w: (-scores[w], w))
        yield word, ranked[:self.TOP_N]

    def steps(self):
        """Return the three chained steps described in the class docstring."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(reducer=self.relative_freq),
            MRStep(reducer=self.ten_most_pop_words),
        ]
if __name__ == '__main__':
    # Launch the job only when this file is executed as a script, not
    # when it is imported.
    MRWordFrequencyCount.run()