Skip to content

Commit d6b58da

Browse files
author
Pedro Bernardo
committed
Added pairRdd/groupbykey/*.py
1 parent 244fe9b commit d6b58da

File tree

3 files changed

+59
-0
lines changed

3 files changed

+59
-0
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text,
    output the list of the names of the airports located in each country.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:

    "Canada", ["Bagotville", "Montreal", "Coronation", ...]
    "Norway" : ["Vigra", "Andenes", "Alta", "Bomoen", "Bronnoy",..]
    "Papua New Guinea", ["Goroka", "Madang", ...]
    ...

    '''
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from pyspark import SparkContext
from commons.Utils import Utils

if __name__ == "__main__":
    '''
    Read the airport data from in/airports.text and print, for each country,
    the list of names of the airports located in that country.
    '''
    sc = SparkContext("local", "airports")
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/airports.text")

    # Split each CSV row exactly once (the original split every row twice,
    # once per tuple element), then build (country, airport name) pairs.
    # Column 3 is the country, column 1 is the airport name.
    countryAndAirportNamePair = lines \
        .map(lambda line: Utils.COMMA_DELIMITER.split(line)) \
        .map(lambda columns: (columns[3], columns[1]))

    airportsByCountry = countryAndAirportNamePair.groupByKey()

    # collectAsMap() brings the grouped pairs back to the driver as a dict;
    # each value is an iterable of airport names, materialized with list().
    for country, airportNames in airportsByCountry.collectAsMap().items():
        print("{}: {}".format(country, list(airportNames)))
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from pyspark import SparkContext

if __name__ == "__main__":
    '''
    Compute the same word counts two ways — with reduceByKey and with
    groupByKey — and print both results for comparison.
    '''
    sc = SparkContext("local", "GroupByKeyVsReduceByKey")
    sc.setLogLevel("ERROR")

    words = ["one", "two", "two", "three", "three", "three"]
    wordPairs = sc.parallelize(words).map(lambda w: (w, 1))

    # Variant 1: reduceByKey sums the 1s per word.
    countsViaReduce = wordPairs.reduceByKey(lambda a, b: a + b).collect()
    print("wordCountsWithReduceByKey: {}".format(list(countsViaReduce)))

    # Variant 2: groupByKey gathers all 1s per word, then the group size
    # is taken as the count.
    countsViaGroup = wordPairs.groupByKey().mapValues(len).collect()
    print("wordCountsWithGroupByKey: {}".format(list(countsViaGroup)))

0 commit comments

Comments
 (0)