forked from MLWave/Kaggle-Ensemble-Guide
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
MLWave
committed
Jun 13, 2015
1 parent
bb66f48
commit f56c391
Showing
4 changed files
with
171 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
from sklearn import ensemble, cross_validation, preprocessing, linear_model, neighbors, svm, calibration | ||
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score | ||
import numpy as np | ||
import pandas as pd | ||
import random | ||
import md5 | ||
import json | ||
from glob import glob | ||
|
||
def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="", save_test_only="",
                seed=300373, save_params="", clf_name="XX", generalizers_params=[],
                minimal_loss=0, return_score=False, minimizer="log_loss"):
    """Stacked-generalization helper: produce out-of-fold train predictions
    and full-train test predictions for one classifier.

    Parameters
    ----------
    clf : estimator with fit / predict_proba (scikit-learn style).
    X_train, y : training data and labels (indexable by fold index arrays).
    X_test : test data to predict after refitting on the full train set.
    nfolds : number of stratified CV folds.
    save_preds : if non-empty, path prefix for .npy dumps of both blend sets.
    save_test_only : if non-empty, path prefix for a .txt dump of hard test
        predictions plus a JSON file of the stacker/generalizer params.
    seed : random_state for the stratified fold split.
    save_params : if non-empty, path prefix for a JSON dump of the params.
    clf_name : short tag used in file names; "XX" derives one from str(clf).
    generalizers_params : parameter dicts of first-level models, stored in
        the JSON dumps only (never mutated here).
    minimal_loss : if > 0 and the first fold's accumulated loss exceeds it,
        abort early and return (False, False).
    return_score : also return the average fold loss.
    minimizer : "log_loss" or "accuracy" -- which metric to accumulate.

    Returns
    -------
    (train_blend, test_blend[, avg_loss]) -- positive-class column only for
    binary problems, full probability matrices otherwise; or (False, False)
    on the minimal_loss early abort.
    """
    import hashlib  # local: replaces the Py2-only, removed-in-Py3 `md5` module

    print("\nBlending with classifier:\n\t%s" % (clf))
    folds = list(cross_validation.StratifiedKFold(y, nfolds, shuffle=True, random_state=seed))
    print(X_train.shape)
    # One column per class: out-of-fold probability predictions for the train set.
    dataset_blend_train = np.zeros((X_train.shape[0], np.unique(y).shape[0]))

    # Iterate through the train set: fit on nfolds-1 folds, predict the held-out fold.
    loss = 0
    for i, (train_index, test_index) in enumerate(folds):
        print("Train Fold %s/%s" % (i + 1, nfolds))
        fold_X_train = X_train[train_index]
        fold_y_train = y[train_index]
        fold_X_test = X_train[test_index]
        fold_y_test = y[test_index]
        clf.fit(fold_X_train, fold_y_train)

        fold_preds = clf.predict_proba(fold_X_test)
        print("Logistic loss: %s" % log_loss(fold_y_test, fold_preds))
        dataset_blend_train[test_index] = fold_preds
        if minimizer == "log_loss":
            loss += log_loss(fold_y_test, fold_preds)
        if minimizer == "accuracy":
            # NOTE(review): accuracy is a score (higher is better), yet it is
            # accumulated into `loss`, so the minimal_loss early-abort below has
            # inverted meaning for this minimizer. Kept as-is for compatibility.
            fold_preds_a = np.argmax(fold_preds, axis=1)
            loss += accuracy_score(fold_y_test, fold_preds_a)

        # Early abort after the first fold if the loss is already hopeless.
        if minimal_loss > 0 and loss > minimal_loss and i == 0:
            return False, False
        fold_preds = np.argmax(fold_preds, axis=1)
        print("Accuracy: %s" % accuracy_score(fold_y_test, fold_preds))
    avg_loss = loss / float(i + 1)
    print("\nAverage:\t%s\n" % avg_loss)

    # Predict the test set after refitting on all training data
    # (averaging over per-fold models would be better, but this is quicker).
    print("Test Fold 1/1")
    clf.fit(X_train, y)
    dataset_blend_test = clf.predict_proba(X_test)

    if clf_name == "XX":
        clf_name = str(clf)[1:3]

    def _param_hash():
        # Fingerprint of the stacker's hyper-parameters, used to make
        # output file names unique per configuration.
        return hashlib.md5(("%s" % str(clf.get_params())).encode("utf-8")).hexdigest()

    if len(save_preds) > 0:
        param_hash = _param_hash()  # was `id`, which shadowed the builtin
        print("storing meta predictions at: %s" % save_preds)
        np.save("%s%s_%s_%s_train.npy" % (save_preds, clf_name, avg_loss, param_hash), dataset_blend_train)
        np.save("%s%s_%s_%s_test.npy" % (save_preds, clf_name, avg_loss, param_hash), dataset_blend_test)

    if len(save_test_only) > 0:
        param_hash = _param_hash()
        print("storing meta predictions at: %s" % save_test_only)

        dataset_blend_test = clf.predict(X_test)  # hard labels, not probabilities
        np.savetxt("%s%s_%s_%s_test.txt" % (save_test_only, clf_name, avg_loss, param_hash), dataset_blend_test)
        d = {}
        d["stacker"] = clf.get_params()
        d["generalizers"] = generalizers_params
        # Text mode: json.dump writes str, which fails on a binary-mode file in Py3.
        with open("%s%s_%s_%s_params.json" % (save_test_only, clf_name, avg_loss, param_hash), 'w') as f:
            json.dump(d, f)

    if len(save_params) > 0:
        param_hash = _param_hash()
        d = {}
        d["name"] = clf_name
        # Nested estimators (their str() contains "\n" or "<") are not JSON
        # serializable, so store their get_params() dict instead.
        d["params"] = {k: (v.get_params() if "\n" in str(v) or "<" in str(v) else v)
                       for k, v in clf.get_params().items()}
        d["generalizers"] = generalizers_params
        with open("%s%s_%s_%s_params.json" % (save_params, clf_name, avg_loss, param_hash), 'w') as f:
            json.dump(d, f)

    if np.unique(y).shape[0] == 2:  # binary classification: return positive class proba only
        if return_score:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1], avg_loss
        else:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1]
    else:
        if return_score:
            return dataset_blend_train, dataset_blend_test, avg_loss
        else:
            return dataset_blend_train, dataset_blend_test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from collections import defaultdict | ||
from glob import glob | ||
import sys | ||
|
||
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    """Average the score column of several Kaggle submission CSVs.

    glob_files -- shell glob matching the submission files; each has a
        header row followed by "id,score" rows in the same order.
    loc_outfile -- path for the averaged submission.
    method, weights -- kept for interface compatibility; only the uniform
        "average" method is implemented.

    Raises ValueError if the glob matches no files.
    """
    if method == "average":
        scores = defaultdict(float)
        n_files = 0
        # Text mode: writing str to a 'wb' file fails on Python 3.
        with open(loc_outfile, "w") as outfile:
            for n_files, glob_file in enumerate(glob(glob_files), start=1):
                print("parsing: %s" % glob_file)
                with open(glob_file) as infile:  # close inputs deterministically
                    for e, line in enumerate(infile):
                        if n_files == 1 and e == 0:
                            # Copy the header from the first file only.
                            outfile.write(line)
                        if e > 0:
                            row = line.strip().split(",")
                            # Key on (row position, id) so sorting restores file order.
                            scores[(e, row[0])] += float(row[1])
            if n_files == 0:
                # Previously this crashed with NameError on the divisor.
                raise ValueError("no files matched %r" % glob_files)
            for e, key in sorted(scores):
                outfile.write("%s,%f\n" % (key, scores[(e, key)] / n_files))


if __name__ == "__main__":
    # Guard so importing this module has no side effects.
    kaggle_bag(sys.argv[1], sys.argv[2])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from __future__ import division | ||
from glob import glob | ||
import sys | ||
from collections import defaultdict | ||
|
||
def kaggle_rank_avg(glob_files, output_file):
    """Rank-average ensemble of Kaggle submission CSVs.

    Each file's scores are converted to within-file ranks, the ranks are
    averaged per row across files, and the average ranks are re-ranked and
    normalized to [0, 1] before being written to `output_file`.

    glob_files -- shell glob matching the submission files; each has a
        header row followed by "id,score" rows in the same order.
    output_file -- path for the rank-averaged submission.
    """
    print(glob_files)
    all_ranks = defaultdict(list)  # (row number, id) -> rank collected per file
    # Text mode: writing str to a 'wb' file fails on Python 3.
    with open(output_file, "w") as outfile:
        for file_nr, glob_file in enumerate(glob(glob_files)):
            file_ranks = []
            print(glob_file)
            with open(glob_file) as infile:  # close inputs deterministically
                for e, line in enumerate(infile):
                    if e == 0 and file_nr == 0:
                        # Copy the header from the first file only.
                        outfile.write(line)
                    elif e > 0:
                        r = line.strip().split(",")
                        file_ranks.append((float(r[1]), e, r[0]))
            # Position in score-sorted order is the within-file rank.
            for rank, item in enumerate(sorted(file_ranks)):
                all_ranks[(item[1], item[2])].append(rank)
        average_ranks = []
        for key in sorted(all_ranks):
            average_ranks.append((sum(all_ranks[key]) / len(all_ranks[key]), key))
        # Normalize final ranks to [0, 1]; guard the single-row case, which
        # previously divided by zero.
        denom = max(len(average_ranks) - 1, 1)
        ranked_ranks = []
        for rank, avg in enumerate(sorted(average_ranks)):
            ranked_ranks.append((avg[1][0], avg[1][1], rank / denom))
        for row in sorted(ranked_ranks):
            outfile.write("%s,%s\n" % (row[1], row[2]))
    print("wrote to %s" % output_file)


if __name__ == "__main__":
    # Guard so importing this module has no side effects.
    kaggle_rank_avg(sys.argv[1], sys.argv[2])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from collections import defaultdict, Counter | ||
from glob import glob | ||
import sys | ||
|
||
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    """Majority-vote ensemble of Kaggle submission CSVs.

    For each row, the label written is the most common prediction across
    all matched files (ties resolved by Counter.most_common order).

    glob_files -- shell glob matching the submission files; each has a
        header row followed by "id,label" rows in the same order.
    loc_outfile -- path for the voted submission.
    method, weights -- kept for interface compatibility; only uniform
        majority voting is implemented.
    """
    if method == "average":
        votes = defaultdict(list)  # (row number, id) -> labels seen per file
        # Text mode: writing str to a 'wb' file fails on Python 3.
        with open(loc_outfile, "w") as outfile:
            for i, glob_file in enumerate(glob(glob_files)):
                print("parsing: %s" % glob_file)
                with open(glob_file) as infile:  # close inputs deterministically
                    for e, line in enumerate(infile):
                        if i == 0 and e == 0:
                            # Copy the header from the first file only.
                            outfile.write(line)
                        if e > 0:
                            row = line.strip().split(",")
                            votes[(e, row[0])].append(row[1])
            for e, key in sorted(votes):
                winner = Counter(votes[(e, key)]).most_common(1)[0][0]
                outfile.write("%s,%s\n" % (key, winner))
        print("wrote to %s" % loc_outfile)


if __name__ == "__main__":
    # Guard so importing this module has no side effects.
    kaggle_bag(sys.argv[1], sys.argv[2])