added cross validation

sgaied · Jan 21, 2016 · 964282d · 964282d
1 parent ec06527
commit 964282d
Show file tree

Hide file tree

Showing 6 changed files with 19 additions and 107 deletions.
diff --git a/TCHUG_NLP_PYSPARK.pptx b/TCHUG_NLP_PYSPARK.pptx
diff --git a/extras/create_training_sets.py b/extras/create_training_sets.py
diff --git a/extras/hive/follower_timeline.sql b/extras/hive/follower_timeline.sql
diff --git a/extras/hive/follower_timeline_raw.sql b/extras/hive/follower_timeline_raw.sql
diff --git a/nlp_with_spark.ipynb b/nlp_with_spark.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -214,7 +214,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -245,7 +245,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -255,37 +255,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "+--------------------+-----+----------+\n|                text|label|prediction|\n+--------------------+-----+----------+\n|acolyte warmachin...|  3.0|       2.0|\n|agree need keep m...|  3.0|       1.0|\n|alright entourage...|  3.0|       0.0|\n|alternative thank...|  3.0|       2.0|\n|annual spring cle...|  3.0|       2.0|\n|anyone aware acad...|  3.0|       2.0|\n|api script servic...|  1.0|       0.0|\n|artificialintelli...|  1.0|       1.0|\n|available skorne ...|  3.0|       2.0|\n|avoid fine penalt...|  3.0|       2.0|\n|barton armed scho...|  3.0|       2.0|\n|based learning bi...|  1.0|       1.0|\n|bedfordshire poli...|  3.0|       2.0|\n|beginning week ja...|  3.0|       0.0|\n|being new company...|  3.0|       2.0|\n|best part econ pa...|  3.0|       2.0|\n|best photo human ...|  3.0|       2.0|\n|better bulldog bu...|  3.0|       2.0|\n|better throw same...|  3.0|       2.0|\n|big data essentia...|  3.0|       1.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
+      "+--------------------+-----+----------+\n|                text|label|prediction|\n+--------------------+-----+----------+\n|big boy toy drone...|  0.0|       1.0|\n|big data result d...|  0.0|       1.0|\n|big data service ...|  0.0|       0.0|\n|       big data wild|  0.0|       0.0|\n|bigdata algorithm...|  0.0|       0.0|\n|bigdata analytics...|  0.0|       0.0|\n|cognitive technol...|  0.0|       0.0|\n|company fight rec...|  0.0|       1.0|\n|data asset inconv...|  0.0|       0.0|\n|data lover kirk b...|  0.0|       0.0|\n|data science hunc...|  0.0|       0.0|\n|datascientists ne...|  0.0|       0.0|\n|facebook data sci...|  0.0|       1.0|\n|foodindustry plan...|  0.0|       1.0|\n|future datascienc...|  0.0|       0.0|\n|gonzalezcarmen th...|  0.0|       0.0|\n|important editori...|  0.0|       0.0|\n|intel doubling co...|  0.0|       0.0|\n|iot analytics edg...|  0.0|       0.0|\n|irish start ups m...|  0.0|       1.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
      ]
     }
    ],
    "source": [
-    "prediction_df.show()"
+    "datasci_df = prediction_df.filter(prediction_df['label']==0.0)\n",
+    "datasci_df.show()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "+--------------------+-----+----------+\n|                text|label|prediction|\n+--------------------+-----+----------+\n|multitun tunnel a...|  0.0|       0.0|\n|phdfootball scrip...|  0.0|       2.0|\n|python pic learn ...|  0.0|       2.0|\n|redis python modu...|  0.0|       0.0|\n|starterlearningpy...|  0.0|       2.0|\n|survival analysis...|  0.0|       1.0|\n|teleport json typ...|  0.0|       2.0|\n|command line tool...|  0.0|       0.0|\n|tckimlik personal...|  0.0|       0.0|\n|more tool data vi...|  0.0|       1.0|\n|python script tir...|  0.0|       2.0|\n|thirtylol lol src...|  0.0|       2.0|\n|good read impleme...|  0.0|       1.0|\n|html validator te...|  0.0|       2.0|\n|article tune regu...|  0.0|       0.0|\n|chainer dcgan cha...|  0.0|       2.0|\n|credstash utility...|  0.0|       2.0|\n|bot data analysis...|  0.0|       0.0|\n|data migration bl...|  0.0|       0.0|\n|great guide data ...|  0.0|       0.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
+      "+--------------------+-----+----------+\n|                text|label|prediction|\n+--------------------+-----+----------+\n|acolyte warmachin...|  1.0|       1.0|\n|alfred producthun...|  1.0|       0.0|\n|alternative thank...|  1.0|       1.0|\n|america greatest ...|  1.0|       1.0|\n|animatic hell lot...|  1.0|       1.0|\n|annual spring cle...|  1.0|       1.0|\n|anyone think secu...|  1.0|       1.0|\n|are looking someo...|  1.0|       1.0|\n|avoid fine penalt...|  1.0|       1.0|\n|bad news toe brin...|  1.0|       1.0|\n|barton armed scho...|  1.0|       1.0|\n|basic classificat...|  1.0|       1.0|\n|bedfordshire poli...|  1.0|       1.0|\n|beginning week ja...|  1.0|       1.0|\n|being new company...|  1.0|       1.0|\n|best part econ pa...|  1.0|       1.0|\n|best photo human ...|  1.0|       1.0|\n|better bulldog bu...|  1.0|       1.0|\n|better throw same...|  1.0|       1.0|\n|big idea disrupt ...|  1.0|       1.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
      ]
     }
    ],
    "source": [
-    "datasci_df = prediction_df.filter(prediction_df['label']==0.0)\n",
-    "datasci_df.show()"
+    "ao_df = prediction_df.filter(prediction_df['label']==1.0)\n",
+    "ao_df.show()"
    ]
   },
   {
@@ -294,7 +295,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    ""
+    "# TODO Add join back to original text\n",
+    "# TODO fix raw_classification labels\n",
+    "# TODO show accuracy measures"
    ]
   }
  ],

diff --git a/preproc.py b/preproc.py
@@ -122,7 +122,8 @@ def lemmatize(data_str):
         else:
             cleaned_str = cleaned_str + ' ' + lemma
         list_pos += 1
-    return cleaned_str
+    trimmed_str = cleaned_str.strip()
+    return trimmed_str
 
 
 # check to see if a row only contains whitespace
@@ -134,12 +135,10 @@ def check_blanks(data_str):
 # convert the text label into a numeric one
 def numeric_label(data_str):
     lower_str = data_str.lower()
-    if lower_str == 'python':
+    if lower_str == 'datasci':
         label = 0.0
-    elif lower_str == 'hadoop':
+    elif lower_str == 'wargaming':
         label = 1.0
-    elif lower_str == 'datasci':
-        label = 2.0
     else:
         label = 3.0
     return label