
Commit 896ab1e

Transformations exercises.
1 parent 098313a commit 896ab1e

File tree

6 files changed: +10267 lines added, 0 removed


sds-rdd-apis/ColumnsNotebook.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

sds-rdd-apis/LogFileDemo.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
from pyspark.sql import *
from pyspark.sql.functions import regexp_extract, substring_index

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("LogFileDemo") \
        .getOrCreate()

    # Read the raw log file; each line becomes a single string column named "value"
    file_df = spark.read.text("data/apache_logs.txt")
    file_df.printSchema()

    # Regex for the Apache combined log format; groups 1, 4, 6 and 10 are the
    # client IP, timestamp, request URI and referrer respectively
    log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

    logs_df = file_df.select(regexp_extract('value', log_reg, 1).alias('ip'),
                             regexp_extract('value', log_reg, 4).alias('date'),
                             regexp_extract('value', log_reg, 6).alias('request'),
                             regexp_extract('value', log_reg, 10).alias('referrer'))

    logs_df.printSchema()

    # Drop records with no referrer, keep only scheme://host, and count hits per referrer
    logs_df \
        .where("trim(referrer) != '-' ") \
        .withColumn("referrer", substring_index("referrer", "/", 3)) \
        .groupBy("referrer") \
        .count() \
        .show(100, truncate=False)
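
For reference only, and not part of this commit: the capture groups the job relies on (1 = client IP, 4 = timestamp, 6 = request URI, 10 = referrer) can be sanity-checked outside Spark with Python's re module. The sample log entry below is made up for illustration and is not taken from data/apache_logs.txt.

import re

# Same pattern as log_reg above
log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

# Hypothetical Apache combined-log entry, for illustration only
sample = ('203.0.113.7 - - [17/May/2015:10:05:03 +0000] '
          '"GET /images/logo.png HTTP/1.1" 200 2048 '
          '"http://example.com/blog/post" "Mozilla/5.0"')

m = re.match(log_reg, sample)
if m:
    print(m.group(1))   # client IP    -> 203.0.113.7
    print(m.group(4))   # timestamp    -> 17/May/2015:10:05:03 +0000
    print(m.group(6))   # request URI  -> /images/logo.png
    print(m.group(10))  # referrer     -> http://example.com/blog/post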

sds-rdd-apis/MyPythonNotebook.ipynb

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "from pyspark.sql import *\n",
        "from pyspark.sql.functions import *\n",
        "from pyspark.sql.types import *\n",
        "\n",
        "def to_date_df(df, fmt, fld):\n",
        "    return df.withColumn(fld, to_date(col(fld), fmt))\n"
      ],
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "title": "",
          "showTitle": false,
          "inputWidgets": {},
          "nuid": "80b42010-ef17-4d97-9ff1-16f7b96464d3"
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "metadata": {
            "application/vnd.databricks.v1+output": {
              "datasetInfos": [],
              "data": "<div class=\"ansiout\"></div>",
              "removedWidgets": [],
              "addedWidgets": {},
              "type": "html",
              "arguments": {}
            }
          },
          "data": {
            "text/html": [
              "<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"></div>"
            ]
          }
        }
      ],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "my_schema = StructType([\n    StructField(\"ID\", StringType()),\n    StructField(\"EventDate\", StringType())])\n\nmy_rows = [Row(\"123\", \"04/05/2020\"), Row(\"124\", \"4/5/2020\"), Row(\"125\", \"04/5/2020\"), Row(\"126\", \"4/05/2020\")]\nmy_rdd = spark.sparkContext.parallelize(my_rows, 2)\nmy_df = spark.createDataFrame(my_rdd, my_schema)"
      ],
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "title": "",
          "showTitle": false,
          "inputWidgets": {},
          "nuid": "4ae5aa80-cd11-4e1a-905e-cf1406a6e1c8"
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "metadata": {
            "application/vnd.databricks.v1+output": {
              "datasetInfos": [
                {
                  "name": "my_df",
                  "typeStr": "pyspark.sql.dataframe.DataFrame",
                  "schema": {
                    "fields": [
                      {
                        "metadata": {},
                        "name": "ID",
                        "nullable": true,
                        "type": "string"
                      },
                      {
                        "metadata": {},
                        "name": "EventDate",
                        "nullable": true,
                        "type": "string"
                      }
                    ],
                    "type": "struct"
                  },
                  "tableIdentifier": null
                }
              ],
              "data": "<div class=\"ansiout\"></div>",
              "removedWidgets": [],
              "addedWidgets": {},
              "type": "html",
              "arguments": {}
            }
          },
          "data": {
            "text/html": [
              "<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"></div>"
            ]
          }
        }
      ],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "my_df.printSchema()\nmy_df.show()\nnew_df = to_date_df(my_df, \"M/d/y\", \"EventDate\")\nnew_df.printSchema()\nnew_df.show()"
      ],
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "title": "",
          "showTitle": false,
          "inputWidgets": {},
          "nuid": "457f165a-d68e-476a-8b32-31ed691770b2"
        }
      },
      "outputs": [
        {
          "output_type": "display_data",
          "metadata": {
            "application/vnd.databricks.v1+output": {
              "datasetInfos": [
                {
                  "name": "new_df",
                  "typeStr": "pyspark.sql.dataframe.DataFrame",
                  "schema": {
                    "fields": [
                      {
                        "metadata": {},
                        "name": "ID",
                        "nullable": true,
                        "type": "string"
                      },
                      {
                        "metadata": {},
                        "name": "EventDate",
                        "nullable": true,
                        "type": "date"
                      }
                    ],
                    "type": "struct"
                  },
                  "tableIdentifier": null
                }
              ],
              "data": "<div class=\"ansiout\">root\n |-- ID: string (nullable = true)\n |-- EventDate: string (nullable = true)\n\n+---+----------+\n| ID| EventDate|\n+---+----------+\n|123|04/05/2020|\n|124| 4/5/2020|\n|125| 04/5/2020|\n|126| 4/05/2020|\n+---+----------+\n\nroot\n |-- ID: string (nullable = true)\n |-- EventDate: date (nullable = true)\n\n+---+----------+\n| ID| EventDate|\n+---+----------+\n|123|2020-04-05|\n|124|2020-04-05|\n|125|2020-04-05|\n|126|2020-04-05|\n+---+----------+\n\n</div>",
              "removedWidgets": [],
              "addedWidgets": {},
              "type": "html",
              "arguments": {}
            }
          },
          "data": {
            "text/html": [
              "<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\">root\n |-- ID: string (nullable = true)\n |-- EventDate: string (nullable = true)\n\n+---+----------+\n| ID| EventDate|\n+---+----------+\n|123|04/05/2020|\n|124| 4/5/2020|\n|125| 04/5/2020|\n|126| 4/05/2020|\n+---+----------+\n\nroot\n |-- ID: string (nullable = true)\n |-- EventDate: date (nullable = true)\n\n+---+----------+\n| ID| EventDate|\n+---+----------+\n|123|2020-04-05|\n|124|2020-04-05|\n|125|2020-04-05|\n|126|2020-04-05|\n+---+----------+\n\n</div>"
            ]
          }
        }
      ],
      "execution_count": 0
    }
  ],
  "metadata": {
    "application/vnd.databricks.v1+notebook": {
      "notebookName": "MyPythonNotebook",
      "dashboards": [],
      "language": "python",
      "widgets": {},
      "notebookOrigID": 343023503281504
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
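
The notebook's last cell output shows that the single pattern "M/d/y" parses both zero-padded and unpadded month/day strings, so all four rows resolve to 2020-04-05. As a minimal sketch (an assumed alternative, not something the notebook itself does), the same conversion can also be written as a Spark SQL expression via expr():

# Sketch only: the same string-to-date conversion expressed as a SQL function call.
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.master("local[1]").appName("ToDateExprSketch").getOrCreate()

df = spark.createDataFrame(
    [("123", "04/05/2020"), ("124", "4/5/2020")],
    ["ID", "EventDate"])

# to_date(column, format) is also available as a SQL function inside expr()
df.withColumn("EventDate", expr("to_date(EventDate, 'M/d/y')")).show()
# Both rows should show EventDate as 2020-04-05
spark.stop()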

sds-rdd-apis/RowDemo.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

from lib.logger import Log4j


def to_date_df(df, fmt, fld):
    # Convert a string column holding dates in the given format to DateType
    return df.withColumn(fld, to_date(fld, fmt))


if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("RowDemo") \
        .getOrCreate()

    logger = Log4j(spark)

    my_schema = StructType([
        StructField("ID", StringType()),
        StructField("EventDate", StringType())])

    my_rows = [Row("123", "04/05/2020"), Row("124", "4/5/2020"), Row("125", "04/5/2020"), Row("126", "4/05/2020")]
    my_rdd = spark.sparkContext.parallelize(my_rows, 2)
    my_df = spark.createDataFrame(my_rdd, my_schema)

    my_df.printSchema()
    my_df.show()
    # "M/d/y" accepts both zero-padded and unpadded month/day values
    new_df = to_date_df(my_df, "M/d/y", "EventDate")
    new_df.printSchema()
    new_df.show()
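
A small aside on the construction above, as a sketch only and not part of this commit: createDataFrame also accepts a plain list of Row objects, so the explicit parallelize() step is mainly there to pin the number of partitions (2). Under that assumption, the same DataFrame can be built without the RDD detour:

# Sketch, assuming partition count is not the point being demonstrated.
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.master("local[3]").appName("RowDemoSketch").getOrCreate()

my_schema = StructType([
    StructField("ID", StringType()),
    StructField("EventDate", StringType())])

# Rows passed directly to createDataFrame; no intermediate RDD
my_rows = [Row("123", "04/05/2020"), Row("124", "4/5/2020")]
my_df = spark.createDataFrame(my_rows, my_schema)
my_df.show()
spark.stop()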

sds-rdd-apis/RowDemo_Test.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
from datetime import date
from unittest import TestCase

from pyspark.sql import *
from pyspark.sql.types import *

from RowDemo import to_date_df


class RowDemoTestCase(TestCase):

    @classmethod
    def setUpClass(cls) -> None:
        # One local SparkSession shared by all tests in this class
        cls.spark = SparkSession.builder \
            .master("local[3]") \
            .appName("RowDemoTest") \
            .getOrCreate()

        my_schema = StructType([
            StructField("ID", StringType()),
            StructField("EventDate", StringType())])

        my_rows = [Row("123", "04/05/2020"), Row("124", "4/5/2020"), Row("125", "04/5/2020"), Row("126", "4/05/2020")]
        my_rdd = cls.spark.sparkContext.parallelize(my_rows, 2)
        cls.my_df = cls.spark.createDataFrame(my_rdd, my_schema)

    def test_data_type(self):
        # Every EventDate value should come back as a Python date object
        rows = to_date_df(self.my_df, "M/d/y", "EventDate").collect()
        for row in rows:
            self.assertIsInstance(row["EventDate"], date)

    def test_date_value(self):
        # All four input formats should parse to the same calendar date
        rows = to_date_df(self.my_df, "M/d/y", "EventDate").collect()
        for row in rows:
            self.assertEqual(row["EventDate"], date(2020, 4, 5))
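
The SparkSession created in setUpClass is never stopped. A cleanup hook along the lines of the sketch below (assumed, not part of this commit; the class name is illustrative) releases it once the class finishes; the committed suite itself can be run with python -m unittest RowDemo_Test.

# Sketch only: the existing setUpClass plus an explicit Spark shutdown.
from unittest import TestCase
from pyspark.sql import SparkSession


class RowDemoTestCaseWithCleanup(TestCase):

    @classmethod
    def setUpClass(cls) -> None:
        cls.spark = SparkSession.builder \
            .master("local[3]") \
            .appName("RowDemoTest") \
            .getOrCreate()

    @classmethod
    def tearDownClass(cls) -> None:
        # Stop the shared SparkSession so repeated runs do not leak resources
        cls.spark.stop()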

0 commit comments
