Skip to content

Commit

Permalink
added cross validation
Browse files Browse the repository at this point in the history
  • Loading branch information
dreyco676 committed Jan 21, 2016
1 parent ec06527 commit 964282d
Show file tree
Hide file tree
Showing 6 changed files with 19 additions and 107 deletions.
Binary file modified TCHUG_NLP_PYSPARK.pptx
Binary file not shown.
71 changes: 0 additions & 71 deletions extras/create_training_sets.py

This file was deleted.

17 changes: 0 additions & 17 deletions extras/hive/follower_timeline.sql

This file was deleted.

2 changes: 0 additions & 2 deletions extras/hive/follower_timeline_raw.sql

This file was deleted.

27 changes: 15 additions & 12 deletions nlp_with_spark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -189,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -214,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -245,7 +245,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -255,37 +255,38 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----+----------+\n| text|label|prediction|\n+--------------------+-----+----------+\n|acolyte warmachin...| 3.0| 2.0|\n|agree need keep m...| 3.0| 1.0|\n|alright entourage...| 3.0| 0.0|\n|alternative thank...| 3.0| 2.0|\n|annual spring cle...| 3.0| 2.0|\n|anyone aware acad...| 3.0| 2.0|\n|api script servic...| 1.0| 0.0|\n|artificialintelli...| 1.0| 1.0|\n|available skorne ...| 3.0| 2.0|\n|avoid fine penalt...| 3.0| 2.0|\n|barton armed scho...| 3.0| 2.0|\n|based learning bi...| 1.0| 1.0|\n|bedfordshire poli...| 3.0| 2.0|\n|beginning week ja...| 3.0| 0.0|\n|being new company...| 3.0| 2.0|\n|best part econ pa...| 3.0| 2.0|\n|best photo human ...| 3.0| 2.0|\n|better bulldog bu...| 3.0| 2.0|\n|better throw same...| 3.0| 2.0|\n|big data essentia...| 3.0| 1.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
"+--------------------+-----+----------+\n| text|label|prediction|\n+--------------------+-----+----------+\n|big boy toy drone...| 0.0| 1.0|\n|big data result d...| 0.0| 1.0|\n|big data service ...| 0.0| 0.0|\n| big data wild| 0.0| 0.0|\n|bigdata algorithm...| 0.0| 0.0|\n|bigdata analytics...| 0.0| 0.0|\n|cognitive technol...| 0.0| 0.0|\n|company fight rec...| 0.0| 1.0|\n|data asset inconv...| 0.0| 0.0|\n|data lover kirk b...| 0.0| 0.0|\n|data science hunc...| 0.0| 0.0|\n|datascientists ne...| 0.0| 0.0|\n|facebook data sci...| 0.0| 1.0|\n|foodindustry plan...| 0.0| 1.0|\n|future datascienc...| 0.0| 0.0|\n|gonzalezcarmen th...| 0.0| 0.0|\n|important editori...| 0.0| 0.0|\n|intel doubling co...| 0.0| 0.0|\n|iot analytics edg...| 0.0| 0.0|\n|irish start ups m...| 0.0| 1.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
]
}
],
"source": [
"prediction_df.show()"
"datasci_df = prediction_df.filter(prediction_df['label']==0.0)\n",
"datasci_df.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+-----+----------+\n| text|label|prediction|\n+--------------------+-----+----------+\n|multitun tunnel a...| 0.0| 0.0|\n|phdfootball scrip...| 0.0| 2.0|\n|python pic learn ...| 0.0| 2.0|\n|redis python modu...| 0.0| 0.0|\n|starterlearningpy...| 0.0| 2.0|\n|survival analysis...| 0.0| 1.0|\n|teleport json typ...| 0.0| 2.0|\n|command line tool...| 0.0| 0.0|\n|tckimlik personal...| 0.0| 0.0|\n|more tool data vi...| 0.0| 1.0|\n|python script tir...| 0.0| 2.0|\n|thirtylol lol src...| 0.0| 2.0|\n|good read impleme...| 0.0| 1.0|\n|html validator te...| 0.0| 2.0|\n|article tune regu...| 0.0| 0.0|\n|chainer dcgan cha...| 0.0| 2.0|\n|credstash utility...| 0.0| 2.0|\n|bot data analysis...| 0.0| 0.0|\n|data migration bl...| 0.0| 0.0|\n|great guide data ...| 0.0| 0.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
"+--------------------+-----+----------+\n| text|label|prediction|\n+--------------------+-----+----------+\n|acolyte warmachin...| 1.0| 1.0|\n|alfred producthun...| 1.0| 0.0|\n|alternative thank...| 1.0| 1.0|\n|america greatest ...| 1.0| 1.0|\n|animatic hell lot...| 1.0| 1.0|\n|annual spring cle...| 1.0| 1.0|\n|anyone think secu...| 1.0| 1.0|\n|are looking someo...| 1.0| 1.0|\n|avoid fine penalt...| 1.0| 1.0|\n|bad news toe brin...| 1.0| 1.0|\n|barton armed scho...| 1.0| 1.0|\n|basic classificat...| 1.0| 1.0|\n|bedfordshire poli...| 1.0| 1.0|\n|beginning week ja...| 1.0| 1.0|\n|being new company...| 1.0| 1.0|\n|best part econ pa...| 1.0| 1.0|\n|best photo human ...| 1.0| 1.0|\n|better bulldog bu...| 1.0| 1.0|\n|better throw same...| 1.0| 1.0|\n|big idea disrupt ...| 1.0| 1.0|\n+--------------------+-----+----------+\nonly showing top 20 rows\n\n"
]
}
],
"source": [
"datasci_df = prediction_df.filter(prediction_df['label']==0.0)\n",
"datasci_df.show()"
"ao_df = prediction_df.filter(prediction_df['label']==1.0)\n",
"ao_df.show()"
]
},
{
Expand All @@ -294,7 +295,9 @@
"metadata": {},
"outputs": [],
"source": [
""
"# TODO Add join back to original text\n",
"# TODO fix raw_classification labels\n",
"# TODO show accuracy measures"
]
}
],
Expand Down
9 changes: 4 additions & 5 deletions preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def lemmatize(data_str):
else:
cleaned_str = cleaned_str + ' ' + lemma
list_pos += 1
return cleaned_str
trimmed_str = cleaned_str.trim()
return trimmed_str


# check to see if a row only contains whitespace
Expand All @@ -134,12 +135,10 @@ def check_blanks(data_str):
# convert the text label into a numeric one
def numeric_label(data_str):
    """Convert a text class label into its numeric class id.

    The comparison is case-insensitive: the input is lower-cased before
    it is matched against the known labels.

    Args:
        data_str: Text label, e.g. ``'datasci'`` or ``'wargaming'``.

    Returns:
        ``0.0`` for ``'datasci'``, ``1.0`` for ``'wargaming'``, and ``3.0``
        for every other label.
    """
    # Mapping as of this commit: the former 'python' and 'hadoop' classes
    # were dropped and 'datasci' moved from 2.0 to 0.0.
    known_labels = {'datasci': 0.0, 'wargaming': 1.0}
    # 3.0 remains the catch-all id for any unrecognised label.
    return known_labels.get(data_str.lower(), 3.0)

0 comments on commit 964282d

Please sign in to comment.