Added pairRdd/aggregation/reducebykey/WordCount.py

Pedro Bernardo · Pedro Bernardo · commit 43f7883c0372 · 2017-10-02T13:16:18.000+02:00
diff --git a/pairRdd/aggregation/reducebykey/WordCount.py b/pairRdd/aggregation/reducebykey/WordCount.py
@@ -0,0 +1,14 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "wordCounts")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/word_count.text")
+    wordRdd = lines.flatMap(lambda line: line.split(" "))
+    wordPairRdd = wordRdd.map(lambda word: (word, 1))
+
+    wordCounts = wordPairRdd.reduceByKey(lambda x, y: x + y)
+    for word, count in wordCounts.collect():
+        print("{} : {}".format(word, count))