upd toc

rasbt · Dec 7, 2014 · 0d94b99 · 0d94b99
1 parent 519619d
commit 0d94b99
Showing 1 changed file with 6 additions and 307 deletions.
diff --git a/code/classify_lyrics/train_lyrics.ipynb b/code/classify_lyrics/train_lyrics.ipynb
@@ -17,7 +17,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 6
+     "prompt_number": 1
     },
     {
      "cell_type": "code",
@@ -32,18 +32,18 @@
        "output_type": "stream",
        "stream": "stdout",
        "text": [
-        "Sebastian Raschka 02/12/2014 \n",
+        "Sebastian Raschka 07/12/2014 \n",
         "\n",
         "CPython 2.7.8\n",
-        "IPython 2.3.0\n",
+        "IPython 2.1.0\n",
         "\n",
-        "scikit-learn 0.14.1\n",
+        "scikit-learn 0.15.2\n",
         "nltk 3.0.0\n",
-        "numpy 1.9.0\n"
+        "numpy 1.9.1\n"
        ]
       }
      ],
-     "prompt_number": 7
+     "prompt_number": 2
     },
     {
      "cell_type": "markdown",
@@ -2104,307 +2104,6 @@
       "<br>\n",
       "<br>"
      ]
-    },
-    {
-     "cell_type": "heading",
-     "level": 2,
-     "metadata": {},
-     "source": [
-      "Save classifier"
-     ]
-    },
-    {
-     "cell_type": "markdown",
-     "metadata": {},
-     "source": [
-      "[[back to top](#Sections)]"
-     ]
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "lyrics_clf_1000 = final_clf\n",
-      "\n",
-      "pickle_out = open('./lyrics_clf_1000_py27.pkl', 'wb')\n",
-      "pickle.dump(lyrics_clf_1000, pickle_out)\n",
-      "pickle_out.close()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 20
-    },
-    {
-     "cell_type": "heading",
-     "level": 1,
-     "metadata": {},
-     "source": [
-      "New pickle objects for webapp"
-     ]
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "import pickle\n",
-      "\n",
-      "pickle_out = open('./lyrics_label_encoder.pkl', 'rb')\n",
-      "le = pickle.load(pickle_out)\n",
-      "pickle_out.close()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 13
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "\n",
-      "from sklearn.naive_bayes import MultinomialNB\n",
-      "\n",
-      "with open('./stopwords_eng.txt', 'r') as infile:\n",
-      "    stop_words = infile.read().splitlines()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": []
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "# Porter Stemmer\n",
-      "\n",
-      "import nltk\n",
-      "import string\n",
-      "import re\n",
-      "\n",
-      "\n",
-      "porter_stemmer = nltk.stem.porter.PorterStemmer()\n",
-      "\n",
-      "def porter_tokenizer(text, stemmer=porter_stemmer):\n",
-      "    \"\"\"\n",
-      "    A Porter-Stemmer-Tokenizer hybrid to splits sentences into words (tokens) \n",
-      "    and applies the porter stemming algorithm to each of the obtained token. \n",
-      "    Tokens that are only consisting of punctuation characters are removed as well.\n",
-      "    Only tokens that consist of more than one letter are being kept.\n",
-      "    \n",
-      "    Parameters\n",
-      "    ----------\n",
-      "        \n",
-      "    text : `str`. \n",
-      "      A sentence that is to split into words.\n",
-      "        \n",
-      "    Returns\n",
-      "    ----------\n",
-      "    \n",
-      "    no_punct : `str`. \n",
-      "      A list of tokens after stemming and removing Sentence punctuation patterns.\n",
-      "    \n",
-      "    \"\"\"\n",
-      "    lower_txt = text.lower()\n",
-      "    tokens = nltk.wordpunct_tokenize(lower_txt)\n",
-      "    stems = [porter_stemmer.stem(t) for t in tokens]\n",
-      "    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]\n",
-      "    return no_punct\n",
-      "\n"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 2
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "import pandas as pd\n",
-      "df = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')\n",
-      "X_train = df['lyrics'].values \n",
-      "y_train = df['mood'].values\n",
-      "\n"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 4
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from sklearn.preprocessing import LabelEncoder\n",
-      "from sklearn.pipeline import Pipeline\n",
-      "\n",
-      "le = LabelEncoder()\n",
-      "le.fit(y_train)\n",
-      "y_train = le.transform(y_train)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "ename": "NameError",
-       "evalue": "name 'y_train' is not defined",
-       "output_type": "pyerr",
-       "traceback": [
-        "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-        "\u001b[0;32m<ipython-input-4-f4949c0aa6db>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLabelEncoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0my_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-        "\u001b[0;31mNameError\u001b[0m: name 'y_train' is not defined"
-       ]
-      }
-     ],
-     "prompt_number": 4
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
-      "\n",
-      "\n",
-      "\n",
-      "final_clf = Pipeline([\n",
-      "                ('vect', TfidfVectorizer(\n",
-      "                                         binary=False,\n",
-      "                                         stop_words=stop_words,\n",
-      "                                         ngram_range=(1,1),\n",
-      "                                         )\n",
-      "                ),\n",
-      "                ('clf', MultinomialNB(alpha=1.0)),\n",
-      "               ])\n",
-      "final_clf.fit(X_train, y_train)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "metadata": {},
-       "output_type": "pyout",
-       "prompt_number": 26,
-       "text": [
-        "Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, charset=None,\n",
-        "        charset_error=None, decode_error=u'strict',\n",
-        "        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',\n",
-        "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
-        "        ngram_range=(1, 1), norm...rue,\n",
-        "        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
-       ]
-      }
-     ],
-     "prompt_number": 26
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from sklearn.externals import joblib\n",
-      "from sklearn.pipeline import Pipeline\n",
-      "\n",
-      "import pickle\n",
-      "\n",
-      "pickle_out = open('./lyrics_label_encoder_np.pkl', 'wb')\n",
-      "pickle.dump(le, pickle_out)\n",
-      "pickle_out.close()\n",
-      "\n",
-      "joblib.dump(final_clf, 'lyrics_clf_1000_np.pkl') "
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "metadata": {},
-       "output_type": "pyout",
-       "prompt_number": 27,
-       "text": [
-        "['lyrics_clf_1000_np.pkl',\n",
-        " 'lyrics_clf_1000_np.pkl_01.npy',\n",
-        " 'lyrics_clf_1000_np.pkl_02.npy',\n",
-        " 'lyrics_clf_1000_np.pkl_03.npy',\n",
-        " 'lyrics_clf_1000_np.pkl_04.npy',\n",
-        " 'lyrics_clf_1000_np.pkl_05.npy',\n",
-        " 'lyrics_clf_1000_np.pkl_06.npy',\n",
-        " 'lyrics_clf_1000_np.pkl_07.npy']"
-       ]
-      }
-     ],
-     "prompt_number": 27
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from sklearn.externals import joblib\n",
-      "\n",
-      "final_clf = joblib.load('lyrics_clf_1000_jb.pkl') "
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 3
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "final_clf.predict(X_train)[:3]"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "metadata": {},
-       "output_type": "pyout",
-       "prompt_number": 6,
-       "text": [
-        "array([1, 0, 1])"
-       ]
-      }
-     ],
-     "prompt_number": 6
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "joblib.dump(le, 'lyrics_label_encoder_jb.pkl') "
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "metadata": {},
-       "output_type": "pyout",
-       "prompt_number": 13,
-       "text": [
-        "['lyrics_label_encoder_jb.pkl', 'lyrics_label_encoder_jb.pkl_01.npy']"
-       ]
-      }
-     ],
-     "prompt_number": 13
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from sklearn.externals import joblib\n",
-      "lyrics_label_encoder = joblib.load('lyrics_label_encoder_jb.pkl') "
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 2
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [],
-     "language": "python",
-     "metadata": {},
-     "outputs": []
     }
    ],
    "metadata": {}