Skip to content

Commit

Permalink
upd toc
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Dec 7, 2014
1 parent 519619d commit 0d94b99
Showing 1 changed file with 6 additions and 307 deletions.
313 changes: 6 additions & 307 deletions code/classify_lyrics/train_lyrics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
"prompt_number": 1
},
{
"cell_type": "code",
Expand All @@ -32,18 +32,18 @@
"output_type": "stream",
"stream": "stdout",
"text": [
"Sebastian Raschka 02/12/2014 \n",
"Sebastian Raschka 07/12/2014 \n",
"\n",
"CPython 2.7.8\n",
"IPython 2.3.0\n",
"IPython 2.1.0\n",
"\n",
"scikit-learn 0.14.1\n",
"scikit-learn 0.15.2\n",
"nltk 3.0.0\n",
"numpy 1.9.0\n"
"numpy 1.9.1\n"
]
}
],
"prompt_number": 7
"prompt_number": 2
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -2104,307 +2104,6 @@
"<br>\n",
"<br>"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Save classifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to top](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"lyrics_clf_1000 = final_clf\n",
"\n",
"pickle_out = open('./lyrics_clf_1000_py27.pkl', 'wb')\n",
"pickle.dump(lyrics_clf_1000, pickle_out)\n",
"pickle_out.close()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"New pickle objects for webapp"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pickle\n",
"\n",
"pickle_out = open('./lyrics_label_encoder.pkl', 'rb')\n",
"le = pickle.load(pickle_out)\n",
"pickle_out.close()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"with open('./stopwords_eng.txt', 'r') as infile:\n",
" stop_words = infile.read().splitlines()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Porter Stemmer\n",
"\n",
"import nltk\n",
"import string\n",
"import re\n",
"\n",
"\n",
"porter_stemmer = nltk.stem.porter.PorterStemmer()\n",
"\n",
"def porter_tokenizer(text, stemmer=porter_stemmer):\n",
" \"\"\"\n",
" A Porter-Stemmer-Tokenizer hybrid that splits sentences into words (tokens) \n",
" and applies the Porter stemming algorithm to each of the obtained tokens. \n",
" Tokens that consist only of punctuation characters are removed as well.\n",
" Only tokens that consist entirely of letters (a-z, A-Z) are kept.\n",
" \n",
" Parameters\n",
" ----------\n",
" \n",
" text : `str`. \n",
" A sentence that is to be split into words.\n",
" \n",
" Returns\n",
" ----------\n",
" \n",
" no_punct : `list` of `str`. \n",
" A list of tokens after stemming and removing punctuation-only tokens.\n",
" \n",
" \"\"\"\n",
" lower_txt = text.lower()\n",
" tokens = nltk.wordpunct_tokenize(lower_txt)\n",
" stems = [porter_stemmer.stem(t) for t in tokens]\n",
" no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]\n",
" return no_punct\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"df = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')\n",
"X_train = df['lyrics'].values \n",
"y_train = df['mood'].values\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"le = LabelEncoder()\n",
"le.fit(y_train)\n",
"y_train = le.transform(y_train)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'y_train' is not defined",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-f4949c0aa6db>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLabelEncoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0my_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'y_train' is not defined"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"\n",
"\n",
"final_clf = Pipeline([\n",
" ('vect', TfidfVectorizer(\n",
" binary=False,\n",
" stop_words=stop_words,\n",
" ngram_range=(1,1),\n",
" )\n",
" ),\n",
" ('clf', MultinomialNB(alpha=1.0)),\n",
" ])\n",
"final_clf.fit(X_train, y_train)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
"Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, charset=None,\n",
" charset_error=None, decode_error=u'strict',\n",
" dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), norm...rue,\n",
" vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.externals import joblib\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"import pickle\n",
"\n",
"pickle_out = open('./lyrics_label_encoder_np.pkl', 'wb')\n",
"pickle.dump(le, pickle_out)\n",
"pickle_out.close()\n",
"\n",
"joblib.dump(final_clf, 'lyrics_clf_1000_np.pkl') "
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 27,
"text": [
"['lyrics_clf_1000_np.pkl',\n",
" 'lyrics_clf_1000_np.pkl_01.npy',\n",
" 'lyrics_clf_1000_np.pkl_02.npy',\n",
" 'lyrics_clf_1000_np.pkl_03.npy',\n",
" 'lyrics_clf_1000_np.pkl_04.npy',\n",
" 'lyrics_clf_1000_np.pkl_05.npy',\n",
" 'lyrics_clf_1000_np.pkl_06.npy',\n",
" 'lyrics_clf_1000_np.pkl_07.npy']"
]
}
],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.externals import joblib\n",
"\n",
"final_clf = joblib.load('lyrics_clf_1000_jb.pkl') "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"final_clf.predict(X_train)[:3]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"array([1, 0, 1])"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"joblib.dump(le, 'lyrics_label_encoder_jb.pkl') "
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"['lyrics_label_encoder_jb.pkl', 'lyrics_label_encoder_jb.pkl_01.npy']"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.externals import joblib\n",
"lyrics_label_encoder = joblib.load('lyrics_label_encoder_jb.pkl') "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
Expand Down

0 comments on commit 0d94b99

Please sign in to comment.