"""Ensemble Kaggle submission files by taking the geometric mean per row.

Recovered from the patch "Added kaggle_geomean.py" (Dat Le, 2015-11-23).
References given in that commit message:
    https://github.com/MLWave/Kaggle-Ensemble-Guide/issues/7
    https://en.wikipedia.org/wiki/Geometric_mean

Usage, as documented in the README hunk of the same patch:

    $ python kaggle_geomean.py "./samples/method*.csv" "./samples/kaggle_geomean.csv"
    parsing: ./samples/method1.csv
    parsing: ./samples/method2.csv
    parsing: ./samples/method3.csv
    wrote to ./samples/kaggle_geomean.csv

Sample result shipped with the patch (samples/kaggle_geomean.csv):

    ImageId,Label
    1,1.587401
    2,0.000000
    3,7.862224
    4,3.301927
    5,3.000000
"""
from collections import defaultdict
from glob import glob
import io
import math
import sys


def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    """Write the per-row geometric mean of several submission CSVs.

    Every file matching ``glob_files`` is read as ``<id>,<prediction>`` rows
    below a single header line. Data rows are sorted lexicographically and
    keyed by ``(position, id)``; the predictions sharing a key are multiplied
    together and the n-th root of the product is written to ``loc_outfile``,
    where n is the number of files that contributed to that key.

    Args:
        glob_files: Glob pattern selecting the input CSV files.
        loc_outfile: Path of the CSV file to write.
        method: Only ``"average"`` is implemented; any other value makes the
            call a silent no-op (nothing is read, written or printed).
        weights: Accepted for signature compatibility; currently unused.

    Raises:
        ValueError: If a prediction is not a number, or if the product for a
            row is negative (``math.pow`` rejects a fractional root of a
            negative number).
        IndexError: If a non-blank data row has no second column.
    """
    if method != "average":
        return

    # Membership in ``products`` marks "seen", not the value 0. A genuine 0
    # prediction must keep the product at 0, so 0 cannot be the sentinel.
    products = {}
    counts = defaultdict(int)
    header = None

    # glob() returns files in arbitrary order; sorting keeps the header
    # source and the "parsing:" log deterministic.
    for path in sorted(glob(glob_files)):
        print("parsing: %s" % path)
        with io.open(path, encoding="utf-8") as infile:
            # Blank lines would sort first and then fail on row[1].
            lines = [line for line in infile if line.strip()]
        if not lines:
            continue
        if header is None:
            header = lines[0].rstrip("\r\n")
        # Sort by raw line text, ignoring the header, so rows line up by
        # position across files.
        for position, line in enumerate(sorted(lines[1:]), start=1):
            row = line.strip().split(",")
            key = (position, row[0])
            products[key] = products.get(key, 1.0) * float(row[1])
            counts[key] += 1

    # Written only after every input parsed, so a bad input file cannot
    # leave a half-written output behind.
    with io.open(loc_outfile, "w", encoding="utf-8", newline="") as outfile:
        if header is not None:
            outfile.write(header + "\n")
        for key in sorted(products):
            # 1.0 / n keeps true division without ``from __future__``.
            mean = math.pow(products[key], 1.0 / counts[key])
            outfile.write("%s,%f\n" % (key[1], mean))
    print("wrote to %s" % loc_outfile)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit('usage: python kaggle_geomean.py "<input glob>" <output csv>')
    glob_files = sys.argv[1]
    loc_outfile = sys.argv[2]
    kaggle_bag(glob_files, loc_outfile)