Skip to content

Commit

Permalink
Merge pull request mongodb#42 from mpobrien/master
Browse files Browse the repository at this point in the history
Sample mapreduce job - enron email corpus
  • Loading branch information
Brendan W. McAdams committed Apr 9, 2012
2 parents fb9b345 + bc20c01 commit 66c9948
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 0 deletions.
20 changes: 20 additions & 0 deletions streaming/examples/enron/enron_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python

import sys
sys.path.append(".")

from pymongo_hadoop import BSONMapper

def mapper(documents):
i = 0
for doc in documents:
i = i + 1
if 'headers' in doc and 'To' in doc['headers'] and 'From' in doc['headers']:
from_field = doc['headers']['From']
to_field = doc['headers']['To']
recips = [x.strip() for x in to_field.split(',')]
for r in recips:
yield {'_id': {'f':from_field, 't':r}, 'count': 1}

BSONMapper(mapper)
print >> sys.stderr, "Done Mapping."
15 changes: 15 additions & 0 deletions streaming/examples/enron/enron_reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env python

import sys
sys.path.append(".")

from pymongo_hadoop import BSONReducer

def reducer(key, values):
print >> sys.stderr, "Processing from/to %s" % str(key)
_count = 0
for v in values:
_count += v['count']
return {'_id': key, 'count': _count}

BSONReducer(reducer)
1 change: 1 addition & 0 deletions streaming/examples/enron/run_enron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.py -reducer examples/enron/enron_reduce.py -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.py -file examples/enron/enron_reduce.py

0 comments on commit 66c9948

Please sign in to comment.