Commit

first commit
MLWave committed Jun 13, 2015
1 parent bb66f48 commit f56c391
Showing 4 changed files with 171 additions and 0 deletions.
89 changes: 89 additions & 0 deletions blend_proba.py
@@ -0,0 +1,89 @@
from sklearn import ensemble, cross_validation, preprocessing, linear_model, neighbors, svm, calibration
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd
import random
import md5
import json
from glob import glob

def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="", save_test_only="", seed=300373, save_params="", clf_name="XX", generalizers_params=[], minimal_loss=0,return_score=False,minimizer="log_loss"):

print("\nBlending with classifier:\n\t%s"%(clf))
folds = list(cross_validation.StratifiedKFold(y, nfolds,shuffle=True,random_state=seed))
print(X_train.shape)
dataset_blend_train = np.zeros((X_train.shape[0],np.unique(y).shape[0]))

#iterate through train set and train - predict folds
loss = 0
for i, (train_index, test_index) in enumerate( folds ):
print("Train Fold %s/%s"%(i+1,nfolds))
fold_X_train = X_train[train_index]
fold_y_train = y[train_index]
fold_X_test = X_train[test_index]
fold_y_test = y[test_index]
clf.fit(fold_X_train, fold_y_train)

fold_preds = clf.predict_proba(fold_X_test)
print("Logistic loss: %s"%log_loss(fold_y_test,fold_preds))
dataset_blend_train[test_index] = fold_preds
if minimizer == "log_loss":
loss += log_loss(fold_y_test,fold_preds)
if minimizer == "accuracy":
fold_preds_a = np.argmax(fold_preds, axis=1)
loss += accuracy_score(fold_y_test,fold_preds_a)
#fold_preds = clf.predict(fold_X_test)

#loss += accuracy_score(fold_y_test,fold_preds)

if minimal_loss > 0 and loss > minimal_loss and i == 0:
return False, False
fold_preds = np.argmax(fold_preds, axis=1)
print("Accuracy: %s"%accuracy_score(fold_y_test,fold_preds))
avg_loss = loss / float(i+1)
print("\nAverage:\t%s\n"%avg_loss)
#predict test set (better to take average on all folds, but this is quicker)
print("Test Fold 1/1")
clf.fit(X_train, y)
dataset_blend_test = clf.predict_proba(X_test)

if clf_name == "XX":
clf_name = str(clf)[1:3]

if len(save_preds)>0:
id = md5.new("%s"%str(clf.get_params())).hexdigest()
print("storing meta predictions at: %s"%save_preds)
np.save("%s%s_%s_%s_train.npy"%(save_preds,clf_name,avg_loss,id),dataset_blend_train)
np.save("%s%s_%s_%s_test.npy"%(save_preds,clf_name,avg_loss,id),dataset_blend_test)

if len(save_test_only)>0:
id = md5.new("%s"%str(clf.get_params())).hexdigest()
print("storing meta predictions at: %s"%save_test_only)

dataset_blend_test = clf.predict(X_test)
np.savetxt("%s%s_%s_%s_test.txt"%(save_test_only,clf_name,avg_loss,id),dataset_blend_test)
d = {}
d["stacker"] = clf.get_params()
d["generalizers"] = generalizers_params
with open("%s%s_%s_%s_params.json"%(save_test_only,clf_name,avg_loss, id), 'wb') as f:
json.dump(d, f)

if len(save_params)>0:
id = md5.new("%s"%str(clf.get_params())).hexdigest()
d = {}
d["name"] = clf_name
d["params"] = { k:(v.get_params() if "\n" in str(v) or "<" in str(v) else v) for k,v in clf.get_params().items()}
d["generalizers"] = generalizers_params
with open("%s%s_%s_%s_params.json"%(save_params,clf_name,avg_loss, id), 'wb') as f:
json.dump(d, f)

if np.unique(y).shape[0] == 2: # when binary classification only return positive class proba
if return_score:
return dataset_blend_train[:,1], dataset_blend_test[:,1], avg_loss
else:
return dataset_blend_train[:,1], dataset_blend_test[:,1]
else:
if return_score:
return dataset_blend_train, dataset_blend_test, avg_loss
else:
return dataset_blend_train, dataset_blend_test
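The helper above targets Python 2 and a pre-0.18 scikit-learn (the md5 and sklearn.cross_validation imports). A minimal usage sketch under those assumptions; the random forest, the ./meta/ directory, and the preloaded numpy arrays X_train, y, X_test are hypothetical and not part of the commit:

    from sklearn.ensemble import RandomForestClassifier
    from blend_proba import blend_proba

    # hypothetical first-stage model for a binary classification problem
    rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=1)

    # out-of-fold probabilities for the train set and full-refit probabilities
    # for the test set; for binary targets only the positive-class column is returned
    train_meta, test_meta = blend_proba(rf, X_train, y, X_test, nfolds=5,
                                        save_preds="./meta/", clf_name="RF")

The two returned arrays (and the optionally saved *_train.npy / *_test.npy files) are presumably concatenated with the predictions of other generalizers and used as features for a second-stage stacking model.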
25 changes: 25 additions & 0 deletions kaggle_avg.py
@@ -0,0 +1,25 @@
from collections import defaultdict
from glob import glob
import sys

glob_files = sys.argv[1]
loc_outfile = sys.argv[2]

def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
if method == "average":
scores = defaultdict(float)
with open(loc_outfile,"wb") as outfile:
for i, glob_file in enumerate( glob(glob_files) ):
print "parsing:", glob_file
for e, line in enumerate( open(glob_file) ):
if i == 0 and e == 0:
outfile.write(line)
if e > 0:
row = line.strip().split(",")


scores[(e,row[0])] += float(row[1])
for j,k in sorted(scores):
outfile.write("%s,%f\n"%(k,scores[(j,k)]/(i+1)))

kaggle_bag(glob_files, loc_outfile)
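A usage sketch (the file layout is an assumption, not shown in the commit): each matched file is expected to be a Kaggle-style submission CSV, a header line followed by id,prediction rows, and every file should list the same ids in the same order, because rows are keyed on (line number, id) and the final division uses the file count. Invoked as, for example, python kaggle_avg.py "./submissions/method_*.csv" kaggle_avg.csv (the quotes stop the shell from expanding the glob), the script copies the header of the first file and replaces each prediction with the arithmetic mean across files, so predictions 0.8 and 0.6 for the same id become 0.700000.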
31 changes: 31 additions & 0 deletions kaggle_rankavg.py
@@ -0,0 +1,31 @@
from __future__ import division
from glob import glob
import sys
from collections import defaultdict

glob_files = sys.argv[1]
output_file = sys.argv[2]

print(glob_files)
with open(output_file,"wb") as outfile:
    all_ranks = defaultdict(list)
    for file_nr, glob_file in enumerate( glob(glob_files) ):
        file_ranks = []
        print(glob_file)
        for e, line in enumerate( open(glob_file) ):
            if e == 0 and file_nr == 0:
                outfile.write( line )
            elif e > 0:
                r = line.strip().split(",")
                file_ranks.append( (float(r[1]), e, r[0]) )
        # rank the predictions within this file (0 = lowest prediction)
        for rank, item in enumerate( sorted(file_ranks) ):
            all_ranks[(item[1],item[2])].append(rank)
    # average the per-file ranks for every (line number, id) key
    average_ranks = []
    for k in sorted(all_ranks):
        average_ranks.append((sum(all_ranks[k])/len(all_ranks[k]),k))
    # re-rank the averaged ranks and normalize them to [0, 1]
    ranked_ranks = []
    for rank, k in enumerate(sorted(average_ranks)):
        ranked_ranks.append((k[1][0],k[1][1],rank/(len(average_ranks)-1)))
    # write rows back in their original order
    for k in sorted(ranked_ranks):
        outfile.write("%s,%s\n"%(k[1],k[2]))
print("wrote to %s"%output_file)
26 changes: 26 additions & 0 deletions kaggle_vote.py
@@ -0,0 +1,26 @@
from collections import defaultdict, Counter
from glob import glob
import sys

glob_files = sys.argv[1]
loc_outfile = sys.argv[2]

def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
if method == "average":
scores = defaultdict(list)
with open(loc_outfile,"wb") as outfile:
for i, glob_file in enumerate( glob(glob_files) ):
print "parsing:", glob_file
for e, line in enumerate( open(glob_file) ):
if i == 0 and e == 0:
outfile.write(line)
if e > 0:
row = line.strip().split(",")


scores[(e,row[0])].append(row[1])
for j,k in sorted(scores):
outfile.write("%s,%s\n"%(k,Counter(scores[(j,k)]).most_common(1)[0][0]))
print("wrote to %s"%loc_outfile)

kaggle_bag(glob_files, loc_outfile)
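The voting variant collects every prediction string per (line number, id) key and writes the most common one, so it is best suited to submissions that contain class labels rather than probabilities: with predictions "1", "0", "1" from three files, the output row for that id is id,1. Counter.most_common breaks exact ties arbitrarily, so an odd number of submissions (or otherwise avoiding ties) is advisable. Invocation follows the same quoted-glob pattern as the other two scripts.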
