Skip to content

Commit c3527a3

Browse files
holdenk authored and pwendell committed
SPARK-1310: Start adding k-fold cross validation to MLLib [adds kFold to MLUtils & fixes bug in BernoulliSampler]
Author: Holden Karau <[email protected]> Closes apache#18 from holdenk/addkfoldcrossvalidation and squashes the following commits: 208db9b [Holden Karau] Fix a bad space e84f2fc [Holden Karau] Fix the test, we should be looking at the second element instead 6ddbf05 [Holden Karau] swap training and validation order 7157ae9 [Holden Karau] CR feedback 90896c7 [Holden Karau] New line 150889c [Holden Karau] Fix up error messages in the MLUtilsSuite 2cb90b3 [Holden Karau] Fix the names in kFold c702a96 [Holden Karau] Fix imports in MLUtils e187e35 [Holden Karau] Move { up to same line as whenExecuting(random) in RandomSamplerSuite.scala c5b723f [Holden Karau] clean up 7ebe4d5 [Holden Karau] CR feedback, remove unecessary learners (came back during merge mistake) and insert an empty line bb5fa56 [Holden Karau] extra line sadness 163c5b1 [Holden Karau] code review feedback 1.to -> 1 to and folds -> numFolds 5a33f1d [Holden Karau] Code review follow up. e8741a7 [Holden Karau] CR feedback b78804e [Holden Karau] Remove cross validation [TODO in another pull request] 91eae64 [Holden Karau] Consolidate things in mlutils 264502a [Holden Karau] Add a test for the bug that was found with BernoulliSampler not copying the complement param dd0b737 [Holden Karau] Wrap long lines (oops) c0b7fa4 [Holden Karau] Switch FoldedRDD to use BernoulliSampler and PartitionwiseSampledRDD 08f8e4d [Holden Karau] Fix BernoulliSampler to respect complement a751ec6 [Holden Karau] Add k-fold cross validation to MLLib
1 parent 9edd887 commit c3527a3

File tree

4 files changed

+82
-9
lines changed

4 files changed

+82
-9
lines changed

core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala

+6-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,12 @@ class BernoulliSampler[T](lb: Double, ub: Double, complement: Boolean = false)
6969
}
7070
}
7171

72-
override def clone = new BernoulliSampler[T](lb, ub)
72+
/**
73+
* Return a sampler with is the complement of the range specified of the current sampler.
74+
*/
75+
def cloneComplement(): BernoulliSampler[T] = new BernoulliSampler[T](lb, ub, !complement)
76+
77+
override def clone = new BernoulliSampler[T](lb, ub, complement)
7378
}
7479

7580
/**

core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala

+16-8
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,31 @@ class RandomSamplerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar
4141
random.nextDouble().andReturn(x)
4242
}
4343
}
44-
whenExecuting(random)
45-
{
44+
whenExecuting(random) {
4645
val sampler = new BernoulliSampler[Int](0.25, 0.55)(random)
4746
assert(sampler.sample(a.iterator).toList == List(3, 4, 5))
4847
}
4948
}
5049

50+
test("BernoulliSamplerWithRangeInverse") {
51+
expecting {
52+
for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
53+
random.nextDouble().andReturn(x)
54+
}
55+
}
56+
whenExecuting(random) {
57+
val sampler = new BernoulliSampler[Int](0.25, 0.55, true)(random)
58+
assert(sampler.sample(a.iterator).toList === List(1, 2, 6, 7, 8, 9))
59+
}
60+
}
61+
5162
test("BernoulliSamplerWithRatio") {
5263
expecting {
5364
for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
5465
random.nextDouble().andReturn(x)
5566
}
5667
}
57-
whenExecuting(random)
58-
{
68+
whenExecuting(random) {
5969
val sampler = new BernoulliSampler[Int](0.35)(random)
6070
assert(sampler.sample(a.iterator).toList == List(1, 2, 3))
6171
}
@@ -67,8 +77,7 @@ class RandomSamplerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar
6777
random.nextDouble().andReturn(x)
6878
}
6979
}
70-
whenExecuting(random)
71-
{
80+
whenExecuting(random) {
7281
val sampler = new BernoulliSampler[Int](0.25, 0.55, true)(random)
7382
assert(sampler.sample(a.iterator).toList == List(1, 2, 6, 7, 8, 9))
7483
}
@@ -78,8 +87,7 @@ class RandomSamplerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar
7887
expecting {
7988
random.setSeed(10L)
8089
}
81-
whenExecuting(random)
82-
{
90+
whenExecuting(random) {
8391
val sampler = new BernoulliSampler[Int](0.2)(random)
8492
sampler.setSeed(10L)
8593
}

mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala

+21
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,16 @@
1717

1818
package org.apache.spark.mllib.util
1919

20+
import scala.reflect.ClassTag
21+
2022
import breeze.linalg.{Vector => BV, SparseVector => BSV, squaredDistance => breezeSquaredDistance}
2123

2224
import org.apache.spark.annotation.Experimental
2325
import org.apache.spark.SparkContext
2426
import org.apache.spark.rdd.RDD
27+
import org.apache.spark.rdd.PartitionwiseSampledRDD
28+
import org.apache.spark.SparkContext._
29+
import org.apache.spark.util.random.BernoulliSampler
2530
import org.apache.spark.mllib.regression.LabeledPoint
2631
import org.apache.spark.mllib.linalg.Vectors
2732

@@ -157,6 +162,22 @@ object MLUtils {
157162
dataStr.saveAsTextFile(dir)
158163
}
159164

165+
/**
166+
* Return a k element array of pairs of RDDs with the first element of each pair
167+
* containing the training data, a complement of the validation data and the second
168+
* element, the validation data, containing a unique 1/kth of the data. Where k=numFolds.
169+
*/
170+
def kFold[T: ClassTag](rdd: RDD[T], numFolds: Int, seed: Int): Array[(RDD[T], RDD[T])] = {
171+
val numFoldsF = numFolds.toFloat
172+
(1 to numFolds).map { fold =>
173+
val sampler = new BernoulliSampler[T]((fold - 1) / numFoldsF, fold / numFoldsF,
174+
complement = false)
175+
val validation = new PartitionwiseSampledRDD(rdd, sampler, seed)
176+
val training = new PartitionwiseSampledRDD(rdd, sampler.cloneComplement(), seed)
177+
(training, validation)
178+
}.toArray
179+
}
180+
160181
/**
161182
* Returns the squared Euclidean distance between two vectors. The following formula will be used
162183
* if it does not introduce too much numerical error:

mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala

+39
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ package org.apache.spark.mllib.util
1919

2020
import java.io.File
2121

22+
import scala.math
23+
import scala.util.Random
24+
2225
import org.scalatest.FunSuite
2326

2427
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, norm => breezeNorm,
@@ -93,4 +96,40 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext {
9396
case t: Throwable =>
9497
}
9598
}
99+
100+
test("kFold") {
101+
val data = sc.parallelize(1 to 100, 2)
102+
val collectedData = data.collect().sorted
103+
val twoFoldedRdd = MLUtils.kFold(data, 2, 1)
104+
assert(twoFoldedRdd(0)._1.collect().sorted === twoFoldedRdd(1)._2.collect().sorted)
105+
assert(twoFoldedRdd(0)._2.collect().sorted === twoFoldedRdd(1)._1.collect().sorted)
106+
for (folds <- 2 to 10) {
107+
for (seed <- 1 to 5) {
108+
val foldedRdds = MLUtils.kFold(data, folds, seed)
109+
assert(foldedRdds.size === folds)
110+
foldedRdds.map { case (training, validation) =>
111+
val result = validation.union(training).collect().sorted
112+
val validationSize = validation.collect().size.toFloat
113+
assert(validationSize > 0, "empty validation data")
114+
val p = 1 / folds.toFloat
115+
// Within 3 standard deviations of the mean
116+
val range = 3 * math.sqrt(100 * p * (1 - p))
117+
val expected = 100 * p
118+
val lowerBound = expected - range
119+
val upperBound = expected + range
120+
assert(validationSize > lowerBound,
121+
s"Validation data ($validationSize) smaller than expected ($lowerBound)" )
122+
assert(validationSize < upperBound,
123+
s"Validation data ($validationSize) larger than expected ($upperBound)" )
124+
assert(training.collect().size > 0, "empty training data")
125+
assert(result === collectedData,
126+
"Each training+validation set combined should contain all of the data.")
127+
}
128+
// K fold cross validation should only have each element in the validation set exactly once
129+
assert(foldedRdds.map(_._2).reduce((x,y) => x.union(y)).collect().sorted ===
130+
data.collect().sorted)
131+
}
132+
}
133+
}
134+
96135
}

0 commit comments

Comments
 (0)