-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from sudarsun/initial
Add script and datasets
- Loading branch information
Showing
61 changed files
with
116,999 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,348 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/home/jeshuren/projects/tomboost/venv/lib/python3.6/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n", | ||
" \"(https://pypi.org/project/six/).\", DeprecationWarning)\n", | ||
"/home/jeshuren/projects/tomboost/venv/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", | ||
" warnings.warn(msg, category=DeprecationWarning)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import os\n", | ||
"import numpy as np \n", | ||
"\n", | ||
"from sklearn import tree\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.metrics import roc_curve, auc\n", | ||
"from sklearn import metrics\n", | ||
"from sklearn.svm import NuSVC\n", | ||
"from sklearn.ensemble import RandomForestClassifier\n", | ||
"from sklearn import neighbors, datasets\n", | ||
"from sklearn.model_selection import StratifiedKFold\n", | ||
"from sklearn.metrics import precision_recall_fscore_support\n", | ||
"from sklearn.tree import DecisionTreeClassifier\n", | ||
"\n", | ||
"from numpy import linalg as LA\n", | ||
"import pandas as pd\n", | ||
"from openpyxl import load_workbook\n", | ||
"\n", | ||
"import time\n", | ||
"import eda as eda\n", | ||
"\n", | ||
"from sklearn.preprocessing import normalize\n", | ||
"\n", | ||
"RANDOM_STATE = 42\n", | ||
"\n", | ||
"np.random.seed(RANDOM_STATE)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"file_loc = \"datasets/abalone71.csv\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"import csv\n", | ||
"from collections import defaultdict\n", | ||
"\n", | ||
"def construct_line( label, line ):\n", | ||
"    \"\"\"Format one record as text: the label followed by index:value pairs.\n", | ||
"\n", | ||
"    label -- value written first (converted with str)\n", | ||
"    line  -- iterable of values; the item at position k is written as k:item\n", | ||
"\n", | ||
"    Returns the space-separated fields terminated by a newline.\n", | ||
"    \"\"\"\n", | ||
"    fields = [str(label)]\n", | ||
"    fields.extend(\"%s:%s\" % (position, value) for position, value in enumerate(line))\n", | ||
"    return \" \".join(fields) + \"\\n\"\n", | ||
"\n", | ||
"def csv2libsvm(data, output_file):\n", | ||
"    \"\"\"Write every row of `data` to `output_file`, one construct_line() record per row.\n", | ||
"\n", | ||
"    The label of each record is the number of items in the row.\n", | ||
"\n", | ||
"    Returns 1 on success and 0 if anything goes wrong (the error is printed).\n", | ||
"    \"\"\"\n", | ||
"    try:\n", | ||
"        # The context manager flushes and closes the file before we return, so a\n", | ||
"        # program started right afterwards is guaranteed to see the complete data.\n", | ||
"        with open( output_file, 'w') as o:\n", | ||
"            for line in data:\n", | ||
"                o.write( construct_line( len(line), line ) )\n", | ||
"        return 1\n", | ||
"    except Exception as e:\n", | ||
"        # Report instead of silently swallowing; KeyboardInterrupt is no longer caught.\n", | ||
"        print(e)\n", | ||
"        return 0" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"class TOMBoost:\n", | ||
"    \"\"\"Boosted decision trees whose initial sample weights come from a topic model.\n", | ||
"\n", | ||
"    Initial weights are derived from the output of the external `lda` program\n", | ||
"    (see getWeights). Every boosting round fits a decision tree on a\n", | ||
"    class-balanced resample drawn according to the current weights.\n", | ||
"    Labels must be -1 (majority class) and 1 (minority class).\n", | ||
"    \"\"\"\n", | ||
"\n", | ||
"    N_CLASSES = 2  # binary problem; used when turning scores into probabilities\n", | ||
"    ERR_EPS = 0.0000001  # weighted error is clamped to [ERR_EPS, 1 - ERR_EPS]\n", | ||
"\n", | ||
"    def __init__(self, M, topics = 10, depth=None):\n", | ||
"        \"\"\"\n", | ||
"        M      -- number of boosting rounds\n", | ||
"        topics -- number of topics passed to the lda command line\n", | ||
"        depth  -- max_depth of every DecisionTreeClassifier (None = unlimited)\n", | ||
"        \"\"\"\n", | ||
"        self.M = M\n", | ||
"        self.topics = topics\n", | ||
"        self.depth = depth\n", | ||
"        self.error = np.zeros((M,1))\n", | ||
"\n", | ||
"    def getWeights(self, X):\n", | ||
"        \"\"\"Return one initial weight per row of X, or None on any failure.\n", | ||
"\n", | ||
"        X is exported with csv2libsvm, `./lda est` is run on the export inside\n", | ||
"        the lda-c directory, and the rows of the final.gamma file it writes are\n", | ||
"        L1-normalised; the weight of a row is the L2 norm of its normalised form.\n", | ||
"        \"\"\"\n", | ||
"        try:\n", | ||
"            # Fix: export the X that was passed in. The original exported the\n", | ||
"            # notebook-global X_train and ignored the argument.\n", | ||
"            status = csv2libsvm(X, file_loc+\".data\")\n", | ||
"\n", | ||
"            if status != 1:\n", | ||
"                # Fix: do not fall through and read a stale final.gamma.\n", | ||
"                print(\"ERROR!!!\")\n", | ||
"                return None\n", | ||
"\n", | ||
"            myCmd = (\"./lda est 0.01 \" + str(self.topics) + \" settings.txt \"\n", | ||
"                     + \"../\" + file_loc + \".data random \"\n", | ||
"                     + \"../\" + file_loc + \"_output/\")\n", | ||
"\n", | ||
"            os.chdir(\"lda-c\")\n", | ||
"            try:\n", | ||
"                returned_value = os.system(myCmd) # returns the exit code in unix\n", | ||
"            finally:\n", | ||
"                # always go back, even if the call above raises\n", | ||
"                os.chdir(\"../\")\n", | ||
"\n", | ||
"            if returned_value != 0:\n", | ||
"                print(\"ERROR!!!\")\n", | ||
"                return None\n", | ||
"\n", | ||
"            X_lda = np.genfromtxt(file_loc +\"_output/final.gamma\",delimiter=' ')\n", | ||
"            X_lda = normalize(X_lda,axis=1,norm='l1')\n", | ||
"            return LA.norm(X_lda,axis=1)\n", | ||
"\n", | ||
"        except Exception as e:\n", | ||
"            print(e)\n", | ||
"            return None\n", | ||
"\n", | ||
"    @staticmethod\n", | ||
"    def _min_max_scale(w):\n", | ||
"        \"\"\"Scale w linearly to [0, 1]; a constant vector maps to all ones, not NaN.\"\"\"\n", | ||
"        span = w.max() - w.min()\n", | ||
"        if span == 0:\n", | ||
"            return np.ones_like(w, dtype=float)\n", | ||
"        return (w - w.min()) / span\n", | ||
"\n", | ||
"    @staticmethod\n", | ||
"    def _balanced_indices(W, n_maj, size):\n", | ||
"        \"\"\"Draw `size` majority and `size` minority row indices with replacement.\n", | ||
"\n", | ||
"        Rows [0, n_maj) are the majority class and rows [n_maj, len(W)) the\n", | ||
"        minority class; within each class the draw is proportional to W.\n", | ||
"        \"\"\"\n", | ||
"        w_maj = W[:n_maj]\n", | ||
"        # Fix: the minority block starts at n_maj. The original started at\n", | ||
"        # n_maj + 1 and could never sample the first minority row.\n", | ||
"        w_min = W[n_maj:]\n", | ||
"        maj_idx = np.random.choice(n_maj, size, p=w_maj / w_maj.sum())\n", | ||
"        min_idx = np.random.choice(np.arange(n_maj, W.shape[0]), size, p=w_min / w_min.sum())\n", | ||
"        return maj_idx, min_idx\n", | ||
"\n", | ||
"    def fit(self, X, Y):\n", | ||
"        \"\"\"Train self.M weighted decision trees on (X, Y).\n", | ||
"\n", | ||
"        X -- 2-D array of samples\n", | ||
"        Y -- labels, -1 for the majority class and 1 for the minority class\n", | ||
"\n", | ||
"        Fills self.models, self.alphas and self.error (weighted error per round).\n", | ||
"        Raises RuntimeError if the initial weights cannot be computed and\n", | ||
"        ValueError if one of the two classes has no samples.\n", | ||
"        \"\"\"\n", | ||
"        self.models = []\n", | ||
"        self.alphas = []\n", | ||
"\n", | ||
"        W = self.getWeights(X)\n", | ||
"        if W is None:\n", | ||
"            # The original returned silently and left an unusable, empty model.\n", | ||
"            raise RuntimeError(\"TOMBoost.fit: initial sample weights could not be computed\")\n", | ||
"\n", | ||
"        X = np.asarray(X)\n", | ||
"        Y = np.asarray(Y)\n", | ||
"        maj_mask = (Y == -1)\n", | ||
"        min_mask = (Y == 1)\n", | ||
"        if not maj_mask.any() or not min_mask.any():\n", | ||
"            raise ValueError(\"TOMBoost.fit: both classes (-1 and 1) must be present\")\n", | ||
"\n", | ||
"        # Scale the weights inside each class, then store everything majority-first\n", | ||
"        # so that rows [0, n_maj) are majority and the rest minority.\n", | ||
"        W = np.concatenate((self._min_max_scale(W[maj_mask]), self._min_max_scale(W[min_mask])),axis=0)\n", | ||
"        W = W / W.sum()\n", | ||
"\n", | ||
"        X_maj, X_min = X[maj_mask], X[min_mask]\n", | ||
"        n_maj, n_min = X_maj.shape[0], X_min.shape[0]\n", | ||
"\n", | ||
"        X = np.concatenate((X_maj,X_min),axis=0)\n", | ||
"        Y = np.concatenate((np.zeros(n_maj)-1,np.ones(n_min)),axis=0)\n", | ||
"\n", | ||
"        for m in range(self.M):\n", | ||
"\n", | ||
"            # balanced resample: as many majority rows as there are minority rows\n", | ||
"            maj_idx, min_idx = self._balanced_indices(W, n_maj, n_min)\n", | ||
"            chosen_indices = np.concatenate((maj_idx,min_idx),axis=0)\n", | ||
"\n", | ||
"            learner = DecisionTreeClassifier(max_depth=self.depth, splitter='best')\n", | ||
"            learner.fit(X[chosen_indices], Y[chosen_indices], sample_weight=W[chosen_indices])\n", | ||
"\n", | ||
"            P = learner.predict(X)\n", | ||
"\n", | ||
"            # Weighted training error, clamped so that both logarithms below are\n", | ||
"            # finite. A perfect learner therefore gets a large finite alpha.\n", | ||
"            err = np.sum(W[P != Y])\n", | ||
"            err = min(max(err, self.ERR_EPS), 1 - self.ERR_EPS)\n", | ||
"\n", | ||
"            # err > 0.5 yields a negative alpha, i.e. the learner votes inverted.\n", | ||
"            # (The original's `m = m - 1` in that case had no effect inside a for loop.)\n", | ||
"            alpha = 0.5 * (np.log(1 - err) - np.log(err))\n", | ||
"            W = W * np.exp(-alpha * Y * P) # vectorized form\n", | ||
"            W = W / W.sum() # normalize so it sums to 1\n", | ||
"\n", | ||
"            self.models.append(learner)\n", | ||
"            self.alphas.append(alpha)\n", | ||
"            self.error[m] = err\n", | ||
"\n", | ||
"    def predict(self, X):\n", | ||
"        \"\"\"Return (labels, scores): scores is the alpha-weighted sum of the trees'\n", | ||
"        predictions and labels is its sign.\"\"\"\n", | ||
"        N, _ = X.shape\n", | ||
"        FX = np.zeros(N)\n", | ||
"        for alpha, tree in zip(self.alphas, self.models):\n", | ||
"            FX += alpha * tree.predict(X)\n", | ||
"        return np.sign(FX), FX\n", | ||
"\n", | ||
"    def _scores_to_proba(self, proba):\n", | ||
"        \"\"\"Divide by the sum of alphas, exponentiate and normalise each row to sum to 1.\"\"\"\n", | ||
"        proba = np.array(proba)\n", | ||
"        proba = proba / sum(self.alphas)\n", | ||
"        proba = np.exp((1. / (self.N_CLASSES - 1)) * proba)\n", | ||
"        normalizer = proba.sum(axis=1)[:, np.newaxis]\n", | ||
"        normalizer[normalizer == 0.0] = 1.0\n", | ||
"        return proba / normalizer\n", | ||
"\n", | ||
"    @staticmethod\n", | ||
"    def _samme_proba(estimator, n_classes, X):\n", | ||
"        \"\"\"Per-class SAMME.R score of one estimator: (K-1) * (log p - mean(log p)).\n", | ||
"\n", | ||
"        Fix: the original called a module-level `_samme_proba` that was never\n", | ||
"        defined or imported, so predict_proba_samme always raised NameError.\n", | ||
"        \"\"\"\n", | ||
"        proba = estimator.predict_proba(X)\n", | ||
"        # keep probabilities strictly positive so the logarithm is finite\n", | ||
"        proba = np.clip(proba, np.finfo(proba.dtype).eps, None)\n", | ||
"        log_proba = np.log(proba)\n", | ||
"        return (n_classes - 1) * (log_proba - (1. / n_classes) * log_proba.sum(axis=1)[:, np.newaxis])\n", | ||
"\n", | ||
"    def predict_proba(self, X):\n", | ||
"        \"\"\"Class probabilities from the alpha-weighted sum of the trees' predict_proba.\"\"\"\n", | ||
"        proba = sum(tree.predict_proba(X) * alpha for tree , alpha in zip(self.models,self.alphas) )\n", | ||
"        return self._scores_to_proba(proba)\n", | ||
"\n", | ||
"    def predict_proba_samme(self, X):\n", | ||
"        \"\"\"Class probabilities from the sum of the trees' SAMME.R scores.\"\"\"\n", | ||
"        proba = sum(self._samme_proba(est , self.N_CLASSES ,X) for est in self.models )\n", | ||
"        return self._scores_to_proba(proba).astype(float)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0\n", | ||
"PRECISION - RECALL - F1-Score Report\n", | ||
"(array([0.94485842, 0.25454545]), array([0.83751651, 0.53164557]), array([0.88795518, 0.3442623 ]), array([757, 79]))\n", | ||
"AUC :0.7718007457819841\n", | ||
"PRECISION - RECALL - F1-Score Report\n", | ||
"(array([0.94375 , 0.21538462]), array([0.79788639, 0.53846154]), array([0.86471009, 0.30769231]), array([757, 78]))\n", | ||
"AUC :0.7927632693154489\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Load the data set and relabel it: the more frequent class becomes -1\n", | ||
"# (majority), the other one +1 (minority), as TOMBoost expects.\n", | ||
"main = eda.eda()\n", | ||
"main.read_data_csv(file_loc,header_row=0)\n", | ||
"\n", | ||
"if len(main.target)-sum(main.target) > sum(main.target):\n", | ||
"    majority_class = 0\n", | ||
"else:\n", | ||
"    majority_class = 1\n", | ||
"\n", | ||
"for i in range(len(main.target)):\n", | ||
"    if main.target[i] == majority_class:\n", | ||
"        main.target[i] = -1\n", | ||
"    else:\n", | ||
"        main.target[i] = 1\n", | ||
"\n", | ||
"X = main.data\n", | ||
"y = main.target\n", | ||
"\n", | ||
"# Four repetitions of a 5-fold stratified cross-validation.\n", | ||
"for t in range(0,4):\n", | ||
"\n", | ||
"    print(t)\n", | ||
"\n", | ||
"    skf = StratifiedKFold(n_splits=5)\n", | ||
"\n", | ||
"    for train_index, test_index in skf.split(X, y):\n", | ||
"\n", | ||
"        X_train, X_test = X[train_index], X[test_index]\n", | ||
"        y_train, y_test = y[train_index], y[test_index]\n", | ||
"\n", | ||
"        try:\n", | ||
"            # Named `model`, not `tree`: rebinding `tree` here would overwrite the\n", | ||
"            # sklearn `tree` module imported in the first cell.\n", | ||
"            model = TOMBoost(10)\n", | ||
"            model.fit(X_train, y_train)\n", | ||
"\n", | ||
"        except Exception as e:\n", | ||
"            import traceback\n", | ||
"            traceback.print_exc()\n", | ||
"            print(e)\n", | ||
"            # give up on the remaining folds of this repetition\n", | ||
"            break\n", | ||
"\n", | ||
"        # predict() returns (labels, raw scores); only the labels are reported\n", | ||
"        y_predict, decision_scores = model.predict(X_test)\n", | ||
"\n", | ||
"        # column 1 of predict_proba is the score of the positive (minority) class\n", | ||
"        y_score = model.predict_proba(X_test)\n", | ||
"        y_score = [row[1] for row in y_score]\n", | ||
"\n", | ||
"        fpr, tpr, _ = roc_curve(y_test, y_score,pos_label=1)\n", | ||
"\n", | ||
"        print('PRECISION - RECALL - F1-Score Report')\n", | ||
"        print(metrics.precision_recall_fscore_support(y_test, y_predict))\n", | ||
"        print('AUC :' + str(auc(fpr, tpr)))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.