-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreducer.py
executable file
·34 lines (27 loc) · 1.19 KB
/
reducer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
"""A more advanced Reducer, using Python iterators and generators."""
from itertools import groupby
from operator import itemgetter
import sys
def read_mapper_output(file, separator='\t'):
    """Lazily split each line of mapper output into its fields.

    Args:
        file: Any iterable of text lines (for example ``sys.stdin``).
        separator: String that separates the key from the value.

    Yields:
        A list per input line: ``[key, value]`` when the separator occurs in
        the line, otherwise a single-element list holding the whole line.
        Trailing whitespace is removed before splitting, and the line is
        split on the first separator only, so the value may itself contain
        the separator.
    """
    for raw_line in file:
        # Strip the newline (and any other trailing whitespace) first.
        stripped = raw_line.rstrip()
        fields = stripped.split(separator, 1)
        yield fields
def main(separator='\t'):
    """Sum the counts for each word read from STDIN and print the totals.

    Input lines are expected in the form ``<word><separator><count>``. All
    records are read, sorted by word, grouped by word, and one line
    ``<word><separator><total>`` is printed per word.

    Malformed records (no separator on the line, or a count that is not an
    integer) are skipped individually, so a single bad line no longer
    discards the valid counts of the same word. A word with no valid
    record at all produces no output.

    Args:
        separator: String separating word and count, on input and output.
    """
    # Input comes from STDIN (standard input).
    records = read_mapper_output(sys.stdin, separator=separator)
    # groupby only merges *adjacent* equal keys, so sort by word first.
    records = sorted(records, key=itemgetter(0))

    # groupby yields each distinct word together with an iterator over all
    # of the records that share that word.
    for current_word, group in groupby(records, itemgetter(0)):
        total_count = 0
        has_valid_count = False
        for record in group:
            if len(record) != 2:
                # Line had no separator, so there is no count to add.
                continue
            try:
                value = int(record[1])
            except ValueError:
                # Count was not a number: discard only this record.
                continue
            total_count += value
            has_valid_count = True
        if has_valid_count:
            print("%s%s%d" % (current_word, separator, total_count))
# Run the reducer only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()