Skip to content

Commit

Permalink
feat: start ex07
Browse files Browse the repository at this point in the history
  • Loading branch information
TheTexanCodeur committed Nov 15, 2024
1 parent dbe534b commit 0111ade
Show file tree
Hide file tree
Showing 4 changed files with 1,375 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@



import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from itertools import combinations
import ast
from sklearn.linear_model import LogisticRegression
import seaborn as sn
%matplotlib inline

data_folder = './data/'





# Load only the columns needed for the adoption-prediction exercise.
columns = [
    'animal_type', 'intake_year', 'intake_condition', 'intake_number',
    'intake_type', 'sex_upon_intake', 'age_upon_intake_(years)',
    'time_in_shelter_days', 'sex_upon_outcome', 'age_upon_outcome_(years)',
    'outcome_type',
]
original_data = pd.read_csv(data_folder + 'aac_intakes_outcomes.csv', usecols=columns)
original_data.head()





# BUG FIX: `data_features` was referenced before ever being defined in this
# file, which raises NameError at runtime. Derive it from the loaded data
# first (the companion notebook does the same), then build the binary target.
data_features = original_data.copy()
# Binary label: 1 when the recorded outcome is 'Adoption', 0 otherwise.
data_features['adopted'] = data_features.outcome_type.apply(lambda r: 1 if r == 'Adoption' else 0)
# The raw outcome column would leak the label into the features, so drop it.
data_features.drop("outcome_type", axis=1, inplace=True)


# Dummy encoding
# One-hot encode every categorical column, including `outcome_type`.
# NOTE(review): encoding `outcome_type` keeps the prediction target inside
# the feature matrix — confirm this is intentional before modelling.
dummy_data = pd.get_dummies(original_data, columns=['outcome_type', 'sex_upon_outcome','animal_type','intake_condition','intake_type','sex_upon_intake',])
dummy_data


# Standardize
# Z-score every column (mean 0, std 1) over the WHOLE dataset.
# NOTE(review): standardizing before the train/test split uses test-set
# statistics (leakage); the companion notebook standardizes after splitting
# with train-only statistics — consider doing the same here.
dummy_data =( dummy_data - dummy_data.mean() ) / dummy_data.std()
dummy_data


def split_set(data_to_split, ratio=0.8):
    """Randomly partition a DataFrame into two pieces.

    Each row lands in the first piece with probability `ratio` (so the
    first piece holds roughly `ratio` of the rows) and in the second
    otherwise. Both pieces get a fresh 0..n-1 index.

    Returns [first_part, second_part].
    """
    selector = np.random.rand(len(data_to_split)) < ratio
    kept = data_to_split[selector].reset_index(drop=True)
    rest = data_to_split[~selector].reset_index(drop=True)
    return [kept, rest]


[train, test] = split_set(dummy_data)






























Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@



import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from itertools import combinations
import ast
from sklearn.linear_model import LogisticRegression
import seaborn as sn
%matplotlib inline

data_folder = './data/'





# Restrict the CSV load to the columns this analysis actually uses.
columns = [
    'animal_type', 'intake_year', 'intake_condition', 'intake_number',
    'intake_type', 'sex_upon_intake', 'age_upon_intake_(years)',
    'time_in_shelter_days', 'sex_upon_outcome', 'age_upon_outcome_(years)',
    'outcome_type',
]
original_data = pd.read_csv(data_folder + 'aac_intakes_outcomes.csv', usecols=columns)


# Report how many rows are lost when dropping records with missing values.
print('The length of the data with all rows is : {}'.format(len(original_data)))
# Remove every row containing NaN in any of the selected columns (in place).
original_data.dropna(inplace=True)
print('The length of the data without the rows with nan value is: {}'.format(len(original_data)))


# Work on a copy so `original_data` stays untouched.
data_features = original_data.copy()
# Binary target: 1 for 'Adoption' outcomes, 0 for everything else.
data_features['adopted'] = data_features.outcome_type.apply(
    lambda outcome: 1 if outcome == 'Adoption' else 0)
# Drop the raw outcome column so it cannot leak the label into the features.
data_features.drop("outcome_type", axis=1, inplace=True)
data_features.head()


def split_set(data_to_split, ratio=0.8):
    """Randomly partition a DataFrame into two pieces.

    Each row lands in the first piece with probability `ratio` (so the
    first piece holds roughly `ratio` of the rows) and in the second
    otherwise. Both pieces get a fresh 0..n-1 index.

    Returns [first_part, second_part].
    """
    selector = np.random.rand(len(data_to_split)) < ratio
    kept = data_to_split[selector].reset_index(drop=True)
    rest = data_to_split[~selector].reset_index(drop=True)
    return [kept, rest]


[train, test] = split_set(data_features)


# One-hot encode the categorical feature columns of the training set.
categorical_columns = [
    'sex_upon_outcome',
    'animal_type',
    'intake_condition',
    'intake_type',
    'sex_upon_intake',
]
train_categorical = pd.get_dummies(train, columns=categorical_columns)
train_categorical.columns





# Make sure we use only the features available in the training set
test_categorical = pd.get_dummies(test, columns=categorical_columns)[train_categorical.columns]


# Separate the target column from the predictors for both partitions.
train_label = train_categorical['adopted']
train_features = train_categorical.drop('adopted', axis=1)
print('Length of the train dataset : {}'.format(len(train)))

test_label = test_categorical['adopted']
test_features = test_categorical.drop('adopted', axis=1)
print('Length of the test dataset : {}'.format(len(test)))


# Standardize with statistics computed on the TRAINING set only, so no
# information from the test set leaks into the features.
means = train_features.mean()
stddevs = train_features.std()

# Pandas aligns on column labels, so a single expression z-scores every
# column — equivalent to looping column by column.
train_features_std = (train_features - means) / stddevs

# Reuse the train mean/std for the test set.
test_features_std = (test_features - means) / stddevs

train_features_std.head()





def compute_confusion_matrix(true_label, prediction_proba, decision_threshold=0.5):
    """Build a 2x2 confusion matrix laid out as [[TP, FP], [FN, TN]].

    `prediction_proba` is the (n, 2) output of predict_proba; column 1
    holds the positive-class probability, thresholded at
    `decision_threshold` (strictly greater-than).
    """
    predicted = (prediction_proba[:, 1] > decision_threshold).astype(int)

    pred_pos = predicted == 1
    actual_pos = true_label == 1

    tp = np.sum(pred_pos & actual_pos)
    tn = np.sum(~pred_pos & ~actual_pos)
    fp = np.sum(pred_pos & ~actual_pos)
    fn = np.sum(~pred_pos & actual_pos)

    return np.asarray([[tp, fp],
                       [fn, tn]])


def plot_confusion_matrix(confusion_matrix):
    """Render the [[TP, FP], [FN, TN]] matrix as an annotated seaborn heatmap."""
    [[TP, FP], [FN, TN]] = confusion_matrix

    # Cell annotations combine the quadrant name with its count.
    annotations = np.asarray([['TP {}'.format(TP), 'FP {}'.format(FP)],
                              ['FN {}'.format(FN), 'TN {}'.format(TN)]])

    # Rows = predicted Yes/No, columns = actual Positive/Negative.
    frame = pd.DataFrame(confusion_matrix, index=['Yes', 'No'],
                         columns=['Positive', 'Negative'])

    return sn.heatmap(frame, cmap='YlOrRd', annot=annotations,
                      annot_kws={"size": 16}, cbar=False, fmt='')


def compute_all_score(confusion_matrix, t=0.5):
    """Compute accuracy plus per-class precision/recall/F1 from a confusion matrix.

    `confusion_matrix` is laid out as [[TP, FP], [FN, TN]]; `t` is the
    decision threshold, passed through so callers can tabulate scores by
    threshold. Returns [t, accuracy, precision_P, recall_P, F1_P,
    precision_N, recall_N, F1_N]; any score with a zero denominator is NaN.
    """
    [[TP, FP], [FN, TN]] = confusion_matrix.astype(float)

    def _ratio(numerator, denominator):
        # Guard divisions that would be undefined for degenerate matrices.
        return numerator / denominator if denominator != 0 else np.nan

    accuracy = (TP + TN) / np.sum(confusion_matrix)

    precision_positive = _ratio(TP, TP + FP)
    precision_negative = _ratio(TN, TN + FN)

    recall_positive = _ratio(TP, TP + FN)
    recall_negative = _ratio(TN, TN + FP)

    F1_score_positive = _ratio(2 * (precision_positive * recall_positive),
                               precision_positive + recall_positive)
    F1_score_negative = _ratio(2 * (precision_negative * recall_negative),
                               precision_negative + recall_negative)

    return [t, accuracy,
            precision_positive, recall_positive, F1_score_positive,
            precision_negative, recall_negative, F1_score_negative]


# Fit a logistic regression (lbfgs solver, generous iteration budget)
# on the standardized training features.
logistic = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic.fit(train_features_std, train_label)

# Class-probability predictions for the held-out test set (shape n x 2).
prediction_proba = logistic.predict_proba(test_features_std)


# Visualize the confusion matrix at the default 0.5 decision threshold.
confusion_matrix_05 = compute_confusion_matrix(test_label, prediction_proba, 0.5)

plt.figure(figsize=(4, 3))
ax = plot_confusion_matrix(confusion_matrix_05)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Confusion matrix for a 0.5 threshold')


# Unpack every score at the 0.5 threshold and report them.
[t, accuracy, precision_positive, recall_positive, F1_score_positive,
 precision_negative, recall_negative, F1_score_negative] = compute_all_score(confusion_matrix_05)

print("The accuracy of this model is {0:1.3f}".format(accuracy))
print("For the positive case, the precision is {0:1.3f}, the recall is {1:1.3f} and the F1 score is {2:1.3f}"
      .format(precision_positive, recall_positive, F1_score_positive))
print("For the negative case, the precision is {0:1.3f}, the recall is {1:1.3f} and the F1 score is {2:1.3f}"
      .format(precision_negative, recall_negative, F1_score_negative))





# Sweep 100 evenly spaced decision thresholds over [0, 1] and collect the
# full score vector at each one into a single threshold-indexed DataFrame.
threshold = np.linspace(0, 1, 100)

columns_score_name = ['Threshold', 'Accuracy', 'Precision P', 'Recall P', 'F1 score P',
                      'Precision N', 'Recall N', 'F1 score N']
score_rows = [
    compute_all_score(compute_confusion_matrix(test_label, prediction_proba, t), t)
    for t in threshold
]
threshold_score = pd.DataFrame(score_rows, columns=columns_score_name)
threshold_score.set_index('Threshold', inplace=True)


threshold_score['Accuracy'].plot(grid=True).set_title('Accuracy')


# Plot the six per-class metrics against the threshold in a 2x3 grid with
# shared axes so the curves are directly comparable.
fig, axs = plt.subplots(nrows=2, ncols=3, sharex=True, sharey=True, figsize=(10, 5))

col_plot = ['Precision P', 'Recall P', 'F1 score P', 'Precision N', 'Recall N', 'F1 score N']

major_ticks = np.linspace(0, 1, 5)

for subplot_axis, metric in zip(axs.flat, col_plot):
    threshold_score[metric].plot(ax=subplot_axis, grid=True)
    subplot_axis.set_title(metric)
    subplot_axis.set_xticks(major_ticks)
    subplot_axis.grid(which='major', alpha=0.5)





# Refit the model (same hyperparameters) to inspect its learned weights.
logistic = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic.fit(train_features_std, train_label)

# Pair each feature name with its coefficient, sorted ascending by weight.
features_coef = pd.DataFrame(
    [{"name": feature, "value": weight}
     for feature, weight in zip(train_features_std.columns, logistic.coef_[0])]
).sort_values("value")
features_coef.head()
features_coef.head()


# Horizontal bar chart of the sorted coefficients; the largest-magnitude
# weights (top and bottom bars) influence the prediction the most.
plt.subplots(figsize=(5,7))
plt.barh(features_coef.name, features_coef.value, alpha=0.6)















Loading

0 comments on commit 0111ade

Please sign in to comment.