Skip to content

Commit

Permalink
some small shuffling before the storm.
Browse files Browse the repository at this point in the history
  • Loading branch information
Bradford Cross committed Apr 13, 2010
1 parent dd826a0 commit f8a0691
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 182 deletions.
178 changes: 0 additions & 178 deletions src/infer/classification.clj
Original file line number Diff line number Diff line change
Expand Up @@ -57,82 +57,6 @@
[transformer classifier count-all]
(fn [obs] (count-all (map classifier (transformer obs)))))

(defn probs-only
"Compute probability from computed counts.
This is division, you have to count up the proper numerator and denominator
in your counting step."
([k a b] [k (probs-only a b)])
([a b] (safe / a b)))

(defn process-prob-map
"Process probability maps using a provided report function. Note that you
can't pass keys to reporter our you get double nested final level in map."
[[a-and-b b] report]
(into {}
(for [[bkey bval] a-and-b]
[bkey
(if (map? bval)
(process-prob-map [bval (b bkey)] report)
(report bval b))])))

(defn model-from-maps
"Creates a model from probability report maps."
[prob-map]
(process-prob-map prob-map probs-only))

(defn invert-map [m]
(into {}
(map (comp vec reverse) m)))

(defn most-likely
"Computes the most likely class from a map of classes to class probability.
=> (most-likely {:a 0.6 :b 0.4})
:a"
[m]
(let [imap (if (map? m)
(invert-map m)
(zipmap m (range 0 (count m))))
likely-class (apply max (keys imap))]
(imap likely-class)))

(defn confusion-matrix
"Computes a confusion matrix from the counts on train and test data sets
represented as maps. traverse map...as you get to the point of counts of
actuals, replace the series of nested keys that lead to that point with the
single key of the predicted"
[trd tst]
(apply deep-merge-with +
(flatten
((fn each-level [tr ts]
(for [[k v] ts
:let [it (tr k)
can-predict (not (nil? it))]]
(if (= 1 (levels-deep v))
(if can-predict
{(most-likely it) v}
{:no-prediction v})
(if can-predict
(each-level it v)
(each-level {} v)))))
trd tst))))

(defn linear-model-confusion-matrix
[trd tst]
(let [score (fn [ts]
{(trd (vec-but-last ts))
{(vec-last ts) 1}})]
(apply deep-merge-with +
(map score tst))))

(defn nn-confusion-matrix
[trd tst]
(let [score (fn [ts]
{(trd ts)
{(vec-last ts) 1}})]
(apply deep-merge-with +
(map score tst))))

(defn precision
"Computes precision by class label from confusion matrix."
[m]
Expand All @@ -158,105 +82,3 @@
(apply + (for [[k-actual v-actual] m]
(threshold-to 0
(v-actual k))))))))])))

;;http://en.wikipedia.org/wiki/Cross-validation_(statistics)
;;TODO: n*k fold
;;leave one out cross validation
;;extra hold out testing?
;;easily M/R-able if needed for some models.
(defn cross-validate
"takes a model, a training/optimization algorithm, a fitness/loss function.
Lastly, takes n seqs of input-output vectors (or feature-target if that's how you roll) to be used as training and test examples.
holds each seq of vectors out in turn as the test set, merges the rest as training, and performs n-way cross-validation.
TODO: for now you are left on your own to aggregate the losses after the fn returns, should we parameterize teh aggregator as well?
"
[model train fitness examples]
(pmap
(fn [test-set]
(let [training-set (remove #{test-set} examples)
trained (train model training-set)]
(fitness trained test-set)))
examples))

(defn to-pmf [model training-set]
(model
(reduce +cond-prob-tuples training-set)))

(defn to-linear-model [model training-set]
(apply
model
(extract-ys
(apply concat training-set))))

(defn to-nn-model [model training-set]
(model
(apply concat training-set)))

(defn cross-validation-confusion-matrix
"Takes a set of n joint PMFs, and holds each joint PMF out in turn as the test
set. Merges the resulting n cross-validation matrices into a single matrix."
[xs]
(apply deep-merge-with +
(cross-validate
model-from-maps
to-pmf
#(confusion-matrix %1 (first %2))
xs)))

(defn cross-validation-linear-model
[xs]
(let [feature-vecs (map (comp
#(feature-vectors % missing-smoother)
first)
xs)]
(apply deep-merge-with +
(cross-validate
(fn [x y]
(bucket #(predict
(ols-linear-model x y)
%)
[0 1 2 3]))
to-linear-model
linear-model-confusion-matrix
feature-vecs))))

(defn cross-validation-logistic-regression
[xs]
(let [feature-vecs (map (comp
#(feature-vectors % missing-smoother)
first)
xs)]
(apply deep-merge-with +
(cross-validate
(fn [x y]
(bucket #(classify
(logistic-regression x y)
%)
[0 1 2 3]))
to-linear-model
linear-model-confusion-matrix
feature-vecs))))

(defn cross-validation-kernel-smoother
[xs]
(let [feature-vecs (map (comp
#(feature-vectors % missing-smoother)
first)
xs)]
(apply deep-merge-with +
(cross-validate
(fn [vecs]
(bucket (knn-smoother 10 vecs)
[0 1 2 3]))
to-nn-model
nn-confusion-matrix
feature-vecs))))

(defn n-times-k-fold-cross-validation-confusion-matrix
[list-of-lists]
(apply deep-merge-with +
(map (partial apply cross-validation-confusion-matrix)
list-of-lists)))
181 changes: 181 additions & 0 deletions src/infer/cross_validation.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
(ns infer.cross-validation
(:use infer.features)
(:use infer.weka-spike)
(:use infer.smoothing)
(:use infer.linear-models)
(:use [clojure.contrib.seq-utils :only [flatten]])
(:use [clojure.contrib.map-utils :only [deep-merge-with]])
(:use [infer.core :only [safe threshold-to map-map levels-deep all-keys]])
(:use [infer.probability :only [bucket +cond-prob-tuples]]))

(defn probs-only
"Compute probability from computed counts.
This is division, you have to count up the proper numerator and denominator
in your counting step."
([k a b] [k (probs-only a b)])
([a b] (safe / a b)))

(defn process-prob-map
"Process probability maps using a provided report function. Note that you
can't pass keys to reporter our you get double nested final level in map."
[[a-and-b b] report]
(into {}
(for [[bkey bval] a-and-b]
[bkey
(if (map? bval)
(process-prob-map [bval (b bkey)] report)
(report bval b))])))

(defn model-from-maps
"Creates a model from probability report maps."
[prob-map]
(process-prob-map prob-map probs-only))

(defn invert-map [m]
(into {}
(map (comp vec reverse) m)))

(defn most-likely
"Computes the most likely class from a map of classes to class probability.
=> (most-likely {:a 0.6 :b 0.4})
:a"
[m]
(let [imap (if (map? m)
(invert-map m)
(zipmap m (range 0 (count m))))
likely-class (apply max (keys imap))]
(imap likely-class)))

(defn confusion-matrix
"Computes a confusion matrix from the counts on train and test data sets
represented as maps. traverse map...as you get to the point of counts of
actuals, replace the series of nested keys that lead to that point with the
single key of the predicted"
[trd tst]
(apply deep-merge-with +
(flatten
((fn each-level [tr ts]
(for [[k v] ts
:let [it (tr k)
can-predict (not (nil? it))]]
(if (= 1 (levels-deep v))
(if can-predict
{(most-likely it) v}
{:no-prediction v})
(if can-predict
(each-level it v)
(each-level {} v)))))
trd tst))))

(defn linear-model-confusion-matrix
[trd tst]
(let [score (fn [ts]
{(trd (vec-but-last ts))
{(vec-last ts) 1}})]
(apply deep-merge-with +
(map score tst))))

(defn nn-confusion-matrix
[trd tst]
(let [score (fn [ts]
{(trd ts)
{(vec-last ts) 1}})]
(apply deep-merge-with +
(map score tst))))

;;http://en.wikipedia.org/wiki/Cross-validation_(statistics)
;;TODO: n*k fold
;;leave one out cross validation
;;extra hold out testing?
;;easily M/R-able if needed for some models.
(defn cross-validate
"takes a model, a training/optimization algorithm, a fitness/loss function.
Lastly, takes n seqs of input-output vectors (or feature-target if that's how you roll) to be used as training and test examples.
holds each seq of vectors out in turn as the test set, merges the rest as training, and performs n-way cross-validation.
TODO: for now you are left on your own to aggregate the losses after the fn returns, should we parameterize teh aggregator as well?
"
[model train fitness examples]
(pmap
(fn [test-set]
(let [training-set (remove #{test-set} examples)
trained (train model training-set)]
(fitness trained test-set)))
examples))

(defn to-pmf [model training-set]
(model
(reduce +cond-prob-tuples training-set)))

(defn to-linear-model [model training-set]
(apply
model
(extract-ys
(apply concat training-set))))

(defn to-nn-model [model training-set]
(model
(apply concat training-set)))

(defn cross-validation-confusion-matrix
"Takes a set of n joint PMFs, and holds each joint PMF out in turn as the test
set. Merges the resulting n cross-validation matrices into a single matrix."
[xs]
(apply deep-merge-with +
(cross-validate
model-from-maps
to-pmf
#(confusion-matrix %1 (first %2))
xs)))

(defn cross-validation-linear-model
[xs]
(let [feature-vecs (map (comp
#(feature-vectors % missing-smoother)
first)
xs)]
(apply deep-merge-with +
(cross-validate
(fn [x y]
(bucket #(predict
(ols-linear-model x y)
%)
[0 1 2 3]))
to-linear-model
linear-model-confusion-matrix
feature-vecs))))

(defn cross-validation-logistic-regression
[xs]
(let [feature-vecs (map (comp
#(feature-vectors % missing-smoother)
first)
xs)]
(apply deep-merge-with +
(cross-validate
(fn [x y]
(bucket #(classify
(logistic-regression x y)
%)
[0 1 2 3]))
to-linear-model
linear-model-confusion-matrix
feature-vecs))))

(defn cross-validation-kernel-smoother
[xs]
(let [feature-vecs (map (comp
#(feature-vectors % missing-smoother)
first)
xs)]
(apply deep-merge-with +
(cross-validate
(fn [vecs]
(bucket (knn-smoother 10 vecs)
[0 1 2 3]))
to-nn-model
nn-confusion-matrix
feature-vecs))))
9 changes: 5 additions & 4 deletions test/infer/classification_test.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
(ns infer.classification-test
(:use clojure.test)
(:use infer.classification)
(:use infer.cross-validation)
(:use infer.probability)
(:use clojure.contrib.map-utils))

Expand Down Expand Up @@ -196,10 +197,10 @@
1 {2 17, 1 43, 0 32}}
{0 260, :missing 90, 2 57, 1 93}])

(deftest cross-validate-a-linear-model
(is (= {0 {2 508, 1 1081, 0 876},
1 {2 454, 0 420, 1 217}}
(cross-validation-linear-model [exs1 exs2]))))
;; (deftest cross-validate-a-linear-model
;; (is (= {0 {2 508, 1 1081, 0 876},
;; 1 {2 454, 0 420, 1 217}}
;; (cross-validation-linear-model [exs1 exs2]))))

;;TODO: doublecheck lingpipe api.
;; (deftest cross-validate-a-logistic-regression-model
Expand Down

0 comments on commit f8a0691

Please sign in to comment.