# https://youtu.be/_5t8ZtRybT8

"""
https://pypi.org/project/Boruta/
pip install Boruta
Dataset:
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
"""

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns


df = pd.read_csv("data/wisconsin_breast_cancer_dataset.csv")

print(df.describe().T)   # Values need to be normalized before fitting.
print(df.isnull().sum())
#df = df.dropna()   # Drop rows with missing values if the check above finds any.

# Rename the Diagnosis column to Label to make it easier to understand
df = df.rename(columns={'Diagnosis': 'Label'})
print(df.dtypes)

# Understand the data
#sns.countplot(x="Label", data=df)   # M - malignant, B - benign


####### Replace categorical values with numbers ########
print(df['Label'].value_counts())

# Define the dependent variable that needs to be predicted (labels)
y = df["Label"].values

# Encode the categorical labels as integers
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
Y = labelencoder.fit_transform(y)   # M=1 and B=0
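# Optional sanity check (not in the original script): LabelEncoder assigns integers in
# alphabetical order of the class names, so classes_ should be ['B' 'M'], i.e. B=0, M=1.
print(labelencoder.classes_)
print(dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_))))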
#################################################################
# Define X and normalize the values

# Define the independent variables. Drop the Label and ID columns so that only the
# numeric measurement features remain to be normalized.
X = df.drop(labels=["Label", "ID"], axis=1)

import numpy as np
feature_names = np.array(X.columns)   # Keep the column names; scaling below returns a plain NumPy array.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
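# Side note (an optional variant, not what this script does): to keep the test split
# completely unseen, the scaler could instead be fit on the training rows only and then
# applied to both splits, e.g. (X_raw is a hypothetical name for the unscaled matrix):
#   X_train, X_test, y_train, y_test = train_test_split(X_raw, Y, test_size=0.25, random_state=42)
#   scaler = StandardScaler().fit(X_train)
#   X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)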

## Split data into train and test sets to verify accuracy after fitting the model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

###########################################################################
# Define the XGBoost classifier to be used by Boruta
import xgboost as xgb
model = xgb.XGBClassifier()   # For Boruta

"""
How Boruta works, in short:
Create shadow features - copies of the real features with the values shuffled within each column.
Train a Random Forest / XGBoost model and calculate feature importances (e.g. mean decrease in impurity).
Check whether the real features have higher importance than the shadow features.
Repeat this for every iteration.
If an original feature repeatedly performs better than the shadow features, mark it as important.
(A small illustrative sketch of a shadow feature follows right after this block.)
"""

from boruta import BorutaPy

# Define the Boruta feature selection method
feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)

# Find all relevant features
feat_selector.fit(X_train, y_train)
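# Optional summary (not printed by the original script): support_ marks confirmed features,
# support_weak_ marks "tentative" ones that Boruta could not confirm or reject in time.
print("Confirmed features:", feat_selector.support_.sum())
print("Tentative features:", feat_selector.support_weak_.sum())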

# Check selected features
print(feat_selector.support_)    # Should we accept the feature?

# Check ranking of features
print(feat_selector.ranking_)    # Rank 1 is the best

# Call transform() on X_train to filter it down to the selected features
X_filtered = feat_selector.transform(X_train)   # Apply feature selection and return the transformed data

"""
Review the features
"""
# Zip feature names, ranks, and decisions
feature_ranks = list(zip(feature_names,
                         feat_selector.ranking_,
                         feat_selector.support_))

# Print the results
for feat in feature_ranks:
    print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))
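# The confirmed feature names can also be pulled out directly with boolean indexing
# (feature_names was kept as a NumPy array above for exactly this purpose):
selected_features = feature_names[feat_selector.support_]
print("Selected features:", list(selected_features))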


############################################################
# Now use the subset of features to fit an XGBoost model on the training data
xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_filtered, y_train)

# Now predict on the test data using the trained model.

# First apply the feature selector transform so that the same features are selected from the test data
X_test_filtered = feat_selector.transform(X_test)
prediction_xgb = xgb_model.predict(X_test_filtered)

# Print overall accuracy
from sklearn import metrics
print("Accuracy = ", metrics.accuracy_score(y_test, prediction_xgb))

# Confusion matrix - verify performance on each class
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction_xgb)
#print(cm)
sns.heatmap(cm, annot=True)
plt.show()   # Display the heatmap when running as a script

#######################################################