Merge pull request #1 from sudarsun/initial

Add script and datasets
sudarsun · Jan 10, 2020 · 3ecdefc · 3ecdefc
2 parents 6040801 + 6a432a7
commit 3ecdefc
Show file tree

Hide file tree

Showing 61 changed files with 116,999 additions and 0 deletions.
diff --git a/TOMBoost.ipynb b/TOMBoost.ipynb
@@ -0,0 +1,348 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/jeshuren/projects/tomboost/venv/lib/python3.6/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n",
+ " \"(https://pypi.org/project/six/).\", DeprecationWarning)\n",
+ "/home/jeshuren/projects/tomboost/venv/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
+ " warnings.warn(msg, category=DeprecationWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import numpy as np \n",
+ "\n",
+ "from sklearn import tree\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import roc_curve, auc\n",
+ "from sklearn import metrics\n",
+ "from sklearn.svm import NuSVC\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn import neighbors, datasets\n",
+ "from sklearn.model_selection import StratifiedKFold\n",
+ "from sklearn.metrics import precision_recall_fscore_support\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "\n",
+ "from numpy import linalg as LA\n",
+ "import pandas as pd\n",
+ "from openpyxl import load_workbook\n",
+ "\n",
+ "import time\n",
+ "import eda as eda\n",
+ "\n",
+ "from sklearn.preprocessing import normalize\n",
+ "\n",
+ "RANDOM_STATE = 42\n",
+ "\n",
+ "np.random.seed(RANDOM_STATE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_loc = \"datasets/abalone71.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import csv\n",
+ "from collections import defaultdict\n",
+ "\n",
+ "def construct_line(label, line):\n",
+ "    \"\"\"Format one row as ``<label> 0:<v0> 1:<v1> ...`` terminated by a newline.\"\"\"\n",
+ "    fields = [str(label)]\n",
+ "    # Every value is tagged with its zero-based position in the row.\n",
+ "    fields.extend(\"%s:%s\" % (index, value) for index, value in enumerate(line))\n",
+ "    return \" \".join(fields) + \"\\n\"\n",
+ "\n",
+ "def csv2libsvm(data, output_file):\n",
+ "    \"\"\"Write every row of ``data`` to ``output_file``, one text line per row.\n",
+ "\n",
+ "    Each line starts with the number of values in the row, followed by\n",
+ "    ``index:value`` pairs (see ``construct_line``).\n",
+ "\n",
+ "    Returns 1 on success and 0 on any failure; callers test for ``== 1``.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        # The context manager guarantees the file is flushed and closed\n",
+ "        # before anything else (e.g. an external process) reads it.\n",
+ "        with open(output_file, 'w') as out:\n",
+ "            for row in data:\n",
+ "                out.write(construct_line(len(row), row))\n",
+ "        return 1\n",
+ "    except Exception:\n",
+ "        # Best effort by design: failure is reported through the status code.\n",
+ "        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit through.\n",
+ "        return 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class TOMBoost:\n",
+ "    \"\"\"Boosted ensemble of decision trees seeded with LDA-derived sample weights.\n",
+ "\n",
+ "    The initial weight of every training row comes from running the external\n",
+ "    ``lda`` binary on the data (see ``getWeights``). Each boosting round draws a\n",
+ "    class-balanced resample according to the current weights, fits a decision\n",
+ "    tree on it and re-weights all rows AdaBoost-style.\n",
+ "\n",
+ "    Labels must be -1 (treated as the majority class) and 1 (minority class).\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, M, topics = 10, depth=None):\n",
+ "        \"\"\"Store the hyper-parameters.\n",
+ "\n",
+ "        M      -- number of boosting rounds\n",
+ "        topics -- number of topics handed to the ``lda`` command\n",
+ "        depth  -- passed as ``max_depth`` to every DecisionTreeClassifier\n",
+ "        \"\"\"\n",
+ "        self.M = M\n",
+ "        self.topics = topics\n",
+ "        self.depth = depth\n",
+ "        # Weighted training error of each round, filled in by fit().\n",
+ "        self.error = np.zeros((M,1))\n",
+ "\n",
+ "    def getWeights(self, X):\n",
+ "        \"\"\"Derive one initial weight per row of ``X`` from an LDA topic model.\n",
+ "\n",
+ "        ``X`` is dumped to ``<file_loc>.data``, the ``lda`` binary is run from the\n",
+ "        ``lda-c`` directory, and ``final.gamma`` from the output directory is\n",
+ "        L1-normalised per row; a row's weight is the L2 norm of its normalised row.\n",
+ "\n",
+ "        Returns the weight vector, or None (after printing a message) when any\n",
+ "        step fails.\n",
+ "        \"\"\"\n",
+ "        data_path = file_loc + \".data\"\n",
+ "        output_dir = file_loc + \"_output/\"\n",
+ "        try:\n",
+ "            # Bug fix: serialise the matrix that was passed in. The previous code\n",
+ "            # read the notebook-level global ``X_train`` and ignored ``X``.\n",
+ "            if csv2libsvm(X, data_path) != 1:\n",
+ "                print(\"ERROR!!! could not write \" + data_path)\n",
+ "                return None\n",
+ "\n",
+ "            # Paths are prefixed with ../ because the command runs inside lda-c/.\n",
+ "            myCmd = (\"./lda est 0.01 \" + str(self.topics) + \" settings.txt \"\n",
+ "                     + \"../\" + data_path + \" random \"\n",
+ "                     + \"../\" + output_dir)\n",
+ "\n",
+ "            start_dir = os.getcwd()\n",
+ "            os.chdir(\"lda-c\")\n",
+ "            try:\n",
+ "                returned_value = os.system(myCmd)  # returns the exit code in unix\n",
+ "            finally:\n",
+ "                # Always restore the working directory, even if the call raises.\n",
+ "                os.chdir(start_dir)\n",
+ "\n",
+ "            if returned_value != 0:\n",
+ "                print(\"ERROR!!!\")\n",
+ "                return None\n",
+ "\n",
+ "            X_lda = np.genfromtxt(output_dir + \"final.gamma\", delimiter=' ')\n",
+ "            X_lda = normalize(X_lda, axis=1, norm='l1')\n",
+ "            return LA.norm(X_lda, axis=1)\n",
+ "\n",
+ "        except Exception as e:\n",
+ "            print(e)\n",
+ "            return None\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def _min_max(values):\n",
+ "        \"\"\"Rescale ``values`` to [0, 1]; a constant vector maps to all ones instead of NaN.\"\"\"\n",
+ "        low = values.min()\n",
+ "        span = values.max() - low\n",
+ "        if span == 0:\n",
+ "            return np.ones_like(values, dtype=float)\n",
+ "        return (values - low) / span\n",
+ "\n",
+ "    def fit(self, X, Y):\n",
+ "        \"\"\"Train ``self.M`` weighted decision trees on ``X`` with labels ``Y`` in {-1, 1}.\n",
+ "\n",
+ "        Populates ``self.models``, ``self.alphas`` and ``self.error``.\n",
+ "        Raises RuntimeError when the LDA weights cannot be computed; the old code\n",
+ "        returned silently and left an empty ensemble that failed later on.\n",
+ "        \"\"\"\n",
+ "        self.models = []\n",
+ "        self.alphas = []\n",
+ "\n",
+ "        W = self.getWeights(X)\n",
+ "        if W is None:\n",
+ "            raise RuntimeError(\"TOMBoost.fit: could not compute LDA sample weights\")\n",
+ "\n",
+ "        X = np.asarray(X)\n",
+ "        Y = np.asarray(Y)\n",
+ "        W = np.asarray(W, dtype=float)\n",
+ "        maj_mask = (Y == -1)\n",
+ "        min_mask = (Y == 1)\n",
+ "\n",
+ "        # Rescale the weights separately per class, then renormalise jointly.\n",
+ "        W = np.concatenate((self._min_max(W[maj_mask]), self._min_max(W[min_mask])), axis=0)\n",
+ "        W = W / W.sum()\n",
+ "\n",
+ "        # Reorder the data the same way: all majority rows first, then minority rows.\n",
+ "        X_maj = X[maj_mask]\n",
+ "        X_min = X[min_mask]\n",
+ "        n_maj = X_maj.shape[0]\n",
+ "        n_min = X_min.shape[0]\n",
+ "        X = np.concatenate((X_maj, X_min), axis=0)\n",
+ "        Y = np.concatenate((-np.ones(n_maj), np.ones(n_min)), axis=0)\n",
+ "\n",
+ "        eps = 0.0000001\n",
+ "        # Each class contributes as many draws as there are minority rows.\n",
+ "        resample_size = n_min\n",
+ "        min_positions = np.arange(n_maj, X.shape[0])\n",
+ "\n",
+ "        for m in range(self.M):\n",
+ "\n",
+ "            # Bug fix: the minority block starts at n_maj, not n_maj + 1; the old\n",
+ "            # slices silently excluded the first minority row from sampling.\n",
+ "            W_maj_norm = W[:n_maj] / W[:n_maj].sum()\n",
+ "            W_min_norm = W[n_maj:] / W[n_maj:].sum()\n",
+ "\n",
+ "            X_maj_indices = np.random.choice(n_maj, resample_size, p=W_maj_norm)\n",
+ "            X_min_indices = np.random.choice(min_positions, resample_size, p=W_min_norm)\n",
+ "            chosen_indices = np.concatenate((X_maj_indices, X_min_indices), axis=0)\n",
+ "\n",
+ "            learner = DecisionTreeClassifier(max_depth=self.depth, splitter='best')\n",
+ "            learner.fit(X[chosen_indices], Y[chosen_indices], sample_weight=W[chosen_indices])\n",
+ "\n",
+ "            P = learner.predict(X)\n",
+ "\n",
+ "            # Weighted error over the full training set, clamped away from 0 and 1\n",
+ "            # so the logarithms below stay finite and alpha is always defined.\n",
+ "            err = np.sum(W[P != Y])\n",
+ "            err = min(max(err, eps), 1.0 - eps)\n",
+ "\n",
+ "            # err > 0.5 gives a negative alpha, i.e. the learner votes inverted.\n",
+ "            # The former ``m = m - 1`` could not repeat a round inside a for loop;\n",
+ "            # it only made the error land in the wrong slot, so it was removed.\n",
+ "            alpha = 0.5 * (np.log(1 - err) - np.log(err))\n",
+ "            W = W * np.exp(-alpha * Y * P)  # vectorized form\n",
+ "            W = W / W.sum()  # normalize so it sums to 1\n",
+ "\n",
+ "            self.models.append(learner)\n",
+ "            self.alphas.append(alpha)\n",
+ "            self.error[m] = err\n",
+ "\n",
+ "    def predict(self, X):\n",
+ "        \"\"\"Return ``(labels, scores)``: the alpha-weighted vote sum and its sign.\"\"\"\n",
+ "        N, _ = X.shape\n",
+ "        FX = np.zeros(N)\n",
+ "        for alpha, learner in zip(self.alphas, self.models):\n",
+ "            FX += alpha * learner.predict(X)\n",
+ "        return np.sign(FX), FX\n",
+ "\n",
+ "    def _scores_to_proba(self, proba):\n",
+ "        \"\"\"Divide summed scores by the alpha total, exponentiate, and normalise each row.\"\"\"\n",
+ "        proba = np.array(proba)\n",
+ "        proba = proba / sum(self.alphas)\n",
+ "        # NOTE(review): the 2 looks like the number of classes -- confirm.\n",
+ "        proba = np.exp((1. / (2 - 1)) * proba)\n",
+ "        normalizer = proba.sum(axis=1)[:, np.newaxis]\n",
+ "        # Avoid dividing by zero for rows whose scores sum to zero.\n",
+ "        normalizer[normalizer == 0.0] = 1.0\n",
+ "        return proba / normalizer\n",
+ "\n",
+ "    def predict_proba(self, X):\n",
+ "        \"\"\"Per-class scores from the alpha-weighted sum of each tree's ``predict_proba``.\"\"\"\n",
+ "        proba = sum(learner.predict_proba(X) * alpha\n",
+ "                    for learner, alpha in zip(self.models, self.alphas))\n",
+ "        return self._scores_to_proba(proba)\n",
+ "\n",
+ "    def predict_proba_samme(self, X):\n",
+ "        \"\"\"Variant of ``predict_proba`` that sums ``_samme_proba`` over the trees.\"\"\"\n",
+ "        # NOTE(review): ``_samme_proba`` is not defined or imported in the visible\n",
+ "        # notebook cells; this raises NameError unless it is provided elsewhere.\n",
+ "        proba = sum(_samme_proba(est, 2, X) for est in self.models)\n",
+ "        return self._scores_to_proba(proba).astype(float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n",
+ "PRECISION - RECALL - F1-Score Report\n",
+ "(array([0.94485842, 0.25454545]), array([0.83751651, 0.53164557]), array([0.88795518, 0.3442623 ]), array([757, 79]))\n",
+ "AUC :0.7718007457819841\n",
+ "PRECISION - RECALL - F1-Score Report\n",
+ "(array([0.94375 , 0.21538462]), array([0.79788639, 0.53846154]), array([0.86471009, 0.30769231]), array([757, 78]))\n",
+ "AUC :0.7927632693154489\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the dataset through the project's eda helper.\n",
+ "main = eda.eda()\n",
+ "main.read_data_csv(file_loc,header_row=0)\n",
+ "\n",
+ "# The more frequent of the 0/1 labels becomes -1, the other becomes 1.\n",
+ "positives = sum(main.target)\n",
+ "majority_class = 0 if len(main.target) - positives > positives else 1\n",
+ "\n",
+ "for i, label in enumerate(main.target):\n",
+ "    main.target[i] = -1 if label == majority_class else 1\n",
+ "\n",
+ "X = main.data\n",
+ "y = main.target\n",
+ "\n",
+ "# Four repetitions of a 5-fold stratified cross-validation.\n",
+ "for t in range(4):\n",
+ "\n",
+ "    print(t)\n",
+ "\n",
+ "    skf = StratifiedKFold(n_splits=5)\n",
+ "\n",
+ "    for train_index, test_index in skf.split(X, y):\n",
+ "\n",
+ "        X_train, y_train = X[train_index], y[train_index]\n",
+ "        X_test, y_test = X[test_index], y[test_index]\n",
+ "\n",
+ "        try:\n",
+ "            tree = TOMBoost(10)\n",
+ "            tree.fit(X_train, y_train)\n",
+ "\n",
+ "        except Exception as e:\n",
+ "            # Report the failure and abandon the remaining folds of this repetition.\n",
+ "            import traceback\n",
+ "            traceback.print_exc()\n",
+ "            print(e)\n",
+ "            break\n",
+ "\n",
+ "        # predict() returns (labels, raw scores); the labels are element 0.\n",
+ "        y_predict = tree.predict(X_test)\n",
+ "\n",
+ "        # Column 1 of predict_proba is used as the score for the class labelled 1.\n",
+ "        y_score = [row[1] for row in tree.predict_proba(X_test)]\n",
+ "\n",
+ "        fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=1)\n",
+ "\n",
+ "        print('PRECISION - RECALL - F1-Score Report')\n",
+ "        print(metrics.precision_recall_fscore_support(y_test, y_predict[0]))\n",
+ "        print('AUC :' + str(auc(fpr, tpr)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}