Commit

Merge pull request #1 from sudarsun/initial
Add script and datasets
jeshuren authored Jan 10, 2020
2 parents 6040801 + 6a432a7 commit 3ecdefc
Showing 61 changed files with 116,999 additions and 0 deletions.
348 changes: 348 additions & 0 deletions TOMBoost.ipynb
@@ -0,0 +1,348 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/jeshuren/projects/tomboost/venv/lib/python3.6/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n",
" \"(https://pypi.org/project/six/).\", DeprecationWarning)\n",
"/home/jeshuren/projects/tomboost/venv/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=DeprecationWarning)\n"
]
}
],
"source": [
"import os\n",
"import numpy as np \n",
"\n",
"from sklearn import tree\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import roc_curve, auc\n",
"from sklearn import metrics\n",
"from sklearn.svm import NuSVC\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn import neighbors, datasets\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import precision_recall_fscore_support\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"from numpy import linalg as LA\n",
"import pandas as pd\n",
"from openpyxl import load_workbook\n",
"\n",
"import time\n",
"import eda as eda\n",
"\n",
"from sklearn.preprocessing import normalize\n",
"\n",
"RANDOM_STATE = 42\n",
"\n",
"np.random.seed(RANDOM_STATE)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"file_loc = \"datasets/abalone71.csv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import csv\n",
"from collections import defaultdict\n",
"\n",
"def construct_line( label, line ):\n",
" new_line = []\n",
" new_line.append(str(label))\n",
"\n",
" for i, item in enumerate( line ):\n",
" new_item = \"%s:%s\" % ( i, item )\n",
" new_line.append( new_item )\n",
" new_line = \" \".join( new_line )\n",
" new_line += \"\\n\"\n",
" return new_line\n",
"\n",
"def csv2libsvm(data, output_file):\n",
" try:\n",
" o = open( output_file, 'w')\n",
"\n",
" reader = data\n",
"\n",
" for line in reader:\n",
" label = len(line)\n",
" new_line = construct_line( label, line )\n",
" o.write( new_line )\n",
" \n",
" return 1\n",
" except:\n",
" return 0"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"class TOMBoost:\n",
" def __init__(self, M, topics = 10, depth=None):\n",
" self.M = M\n",
" self.topics = topics\n",
" self.depth = depth\n",
" self.error = np.zeros((M,1))\n",
"\n",
" def getWeights(self, X):\n",
" try:\n",
" status = csv2libsvm(X_train, file_loc+\".data\")\n",
"\n",
" if status == 1:\n",
" myCmd = \"./lda est 0.01 \" +str(self.topics)+\" settings.txt \" + \"../\" + file_loc +\".data random \" + \\\n",
" \"../\" +file_loc +\"_output/\"\n",
" \n",
" os.chdir(\"lda-c\")\n",
" returned_value = os.system(myCmd) # returns the exit code in unix\n",
" os.chdir(\"../\")\n",
" \n",
" if returned_value != 0:\n",
" print(\"ERROR!!!\")\n",
" return None\n",
"\n",
" X_lda = np.genfromtxt(file_loc +\"_output/final.gamma\",delimiter=' ')\n",
" X_lda = normalize(X_lda,axis=1,norm='l1')\n",
" X_lda = LA.norm(X_lda,axis=1)\n",
" \n",
" return X_lda\n",
" \n",
" except Exception as e:\n",
" print(e)\n",
" return None\n",
" \n",
" def fit(self, X, Y):\n",
" self.models = []\n",
" self.alphas = []\n",
"\n",
" N, _ = X.shape\n",
" \n",
" W = self.getWeights(X)\n",
"\n",
" if not W is None:\n",
" \n",
" W_maj = np.array([W[i] for i in range(X.shape[0]) if Y[i] == -1])\n",
" W_min = np.array([W[i] for i in range(X.shape[0]) if Y[i] == 1])\n",
"\n",
" W_maj = (W_maj - W_maj.min()) / (W_maj.max() - W_maj.min())\n",
" W_min = (W_min - W_min.min()) / (W_min.max() - W_min.min())\n",
"\n",
" W = np.concatenate((W_maj,W_min),axis=0)\n",
"\n",
" W = W/sum(W)\n",
"\n",
" X_maj = np.array([X[i] for i in range(X.shape[0]) if Y[i] == -1])\n",
" X_min = np.array([X[i] for i in range(X.shape[0]) if Y[i] == 1])\n",
"\n",
" X = np.concatenate((X_maj,X_min),axis=0)\n",
" Y = np.concatenate((np.zeros(X_maj.shape[0])-1,np.ones(X_min.shape[0])),axis=0)\n",
"\n",
" for m in range(self.M):\n",
"\n",
" resample_size = X_min.shape[0]\n",
"\n",
" W_maj_norm = W[0:X_maj.shape[0]]/sum(W[0:X_maj.shape[0]])\n",
" W_min_norm = W[X_maj.shape[0]+1:]/sum(W[X_maj.shape[0]+1:])\n",
"\n",
" X_maj_indices = np.random.choice(X_maj.shape[0],resample_size,p=W_maj_norm)\n",
" X_min_indices = np.random.choice(range(X_maj.shape[0]+1,X.shape[0]),resample_size,p=W_min_norm)\n",
"\n",
" X_sampled = np.concatenate((X[X_maj_indices],X[X_min_indices]),axis=0)\n",
" y_sampled = np.concatenate((Y[X_maj_indices],Y[X_min_indices]),axis=0)\n",
" chosen_indices = np.concatenate((X_maj_indices,X_min_indices),axis=0)\n",
"\n",
" tree = DecisionTreeClassifier(max_depth=self.depth, splitter='best')\n",
"\n",
" tree.fit(X_sampled, y_sampled, sample_weight=W[chosen_indices])\n",
"\n",
" P = tree.predict(X)\n",
"\n",
" err = np.sum(W[P != Y])\n",
"\n",
" if err > 0.5:\n",
" m = m - 1\n",
" if err <= 0:\n",
" err = 0.0000001\n",
" else:\n",
" try:\n",
" if (np.log(1 - err) - np.log(err)) == 0 :\n",
" alpha = 0\n",
" else:\n",
" alpha = 0.5 * (np.log(1 - err) - np.log(err))\n",
" W = W * np.exp(-alpha * Y * P) # vectorized form\n",
" W = W / W.sum() # normalize so it sums to 1\n",
" except:\n",
" alpha = 0\n",
" # W = W * np.exp(-alpha * Y * P) # vectorized form\n",
" W = W / W.sum() # normalize so it sums to 1\n",
"\n",
" self.models.append(tree)\n",
" self.alphas.append(alpha)\n",
"\n",
" self.error[m] = err\n",
"\n",
" def predict(self, X):\n",
" N, _ = X.shape\n",
" FX = np.zeros(N)\n",
" for alpha, tree in zip(self.alphas, self.models):\n",
" FX += alpha * tree.predict(X)\n",
" return np.sign(FX), FX\n",
"\n",
" def predict_proba(self, X):\n",
" # if self.alphas == 'SAMME'\n",
" proba = sum(tree.predict_proba(X) * alpha for tree , alpha in zip(self.models,self.alphas) )\n",
"\n",
"\n",
" proba = np.array(proba)\n",
"\n",
"\n",
" proba = proba / sum(self.alphas)\n",
"\n",
" proba = np.exp((1. / (2 - 1)) * proba)\n",
" normalizer = proba.sum(axis=1)[:, np.newaxis]\n",
" normalizer[normalizer == 0.0] = 1.0\n",
" # proba = np.linspace(proba)\n",
" # proba = np.array(proba).astype(float)\n",
" proba = proba / normalizer\n",
"\n",
" # print(proba)\n",
" return proba\n",
"\n",
" def predict_proba_samme(self, X):\n",
" # if self.alphas == 'SAMME.R'\n",
" proba = sum(_samme_proba(est , 2 ,X) for est in self.models )\n",
"\n",
" proba = np.array(proba)\n",
"\n",
" proba = proba / sum(self.alphas)\n",
"\n",
" proba = np.exp((1. / (2 - 1)) * proba)\n",
" normalizer = proba.sum(axis=1)[:, np.newaxis]\n",
" normalizer[normalizer == 0.0] = 1.0\n",
" # proba = np.linspace(proba)\n",
" # proba = np.array(proba).astype(float)\n",
" proba = proba / normalizer\n",
"\n",
" # print('proba = ',proba)\n",
" return proba.astype(float)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"PRECISION - RECALL - F1-Score Report\n",
"(array([0.94485842, 0.25454545]), array([0.83751651, 0.53164557]), array([0.88795518, 0.3442623 ]), array([757, 79]))\n",
"AUC :0.7718007457819841\n",
"PRECISION - RECALL - F1-Score Report\n",
"(array([0.94375 , 0.21538462]), array([0.79788639, 0.53846154]), array([0.86471009, 0.30769231]), array([757, 78]))\n",
"AUC :0.7927632693154489\n"
]
}
],
"source": [
"main = eda.eda()\n",
"main.read_data_csv(file_loc,header_row=0)\n",
"\n",
"if len(main.target)-sum(main.target) > sum(main.target):\n",
" majority_class = 0\n",
"else:\n",
" majority_class = 1\n",
"\n",
"for i in range(len(main.target)):\n",
" if main.target[i] == majority_class:\n",
" main.target[i] = -1\n",
" else:\n",
" main.target[i] = 1\n",
"\n",
"X = main.data\n",
"y = main.target\n",
"\n",
"for t in range(0,4):\n",
" \n",
" print(t) \n",
"\n",
" skf = StratifiedKFold(n_splits=5)\n",
"\n",
" for train_index, test_index in skf.split(X, y):\n",
" \n",
" X_train, X_test = X[train_index], X[test_index]\n",
" y_train, y_test = y[train_index], y[test_index]\n",
"\n",
" try:\n",
" tree = TOMBoost(10)\n",
" tree.fit(X_train, y_train)\n",
" \n",
" except Exception as e:\n",
" import traceback\n",
" traceback.print_exc()\n",
" print(e)\n",
" break\n",
"\n",
" y_predict = tree.predict(X_test)\n",
"\n",
" y_score = tree.predict_proba(X_test)\n",
" y_score = [row[1] for row in y_score]\n",
"\n",
" fpr, tpr, _ = roc_curve(y_test, y_score,pos_label=1)\n",
" \n",
" print('PRECISION - RECALL - F1-Score Report')\n",
" print(metrics.precision_recall_fscore_support(y_test, y_predict[0]))\n",
" print('AUC :' + str(auc(fpr, tpr)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
