Skip to content

Commit

Permalink
[SPARK-11473][ML] R-like summary statistics with intercept for OLS via a normal equation solver
Browse files Browse the repository at this point in the history

As a follow-up to [SPARK-9836](https://issues.apache.org/jira/browse/SPARK-9836), we should also support summary statistics for the ```intercept```.

Author: Yanbo Liang <[email protected]>

Closes apache#9485 from yanboliang/spark-11473.
  • Loading branch information
yanboliang authored and mengxr committed Nov 5, 2015
1 parent b072ff4 commit 9da7cee
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ private[ml] class WeightedLeastSquares(
val summary = instances.treeAggregate(new Aggregator)(_.add(_), _.merge(_))
summary.validate()
logInfo(s"Number of instances: ${summary.count}.")
val k = summary.k
val k = if (fitIntercept) summary.k + 1 else summary.k
val triK = summary.triK
val wSum = summary.wSum
val bBar = summary.bBar
Expand All @@ -86,14 +86,6 @@ private[ml] class WeightedLeastSquares(
val aaBar = summary.aaBar
val aaValues = aaBar.values

if (fitIntercept) {
// shift centers
// A^T A - aBar aBar^T
BLAS.spr(-1.0, aBar, aaValues)
// A^T b - bBar aBar
BLAS.axpy(-bBar, aBar, abBar)
}

// add regularization to diagonals
var i = 0
var j = 2
Expand All @@ -111,21 +103,32 @@ private[ml] class WeightedLeastSquares(
j += 1
}

val x = new DenseVector(CholeskyDecomposition.solve(aaBar.values, abBar.values))
val aa = if (fitIntercept) {
Array.concat(aaBar.values, aBar.values, Array(1.0))
} else {
aaBar.values
}
val ab = if (fitIntercept) {
Array.concat(abBar.values, Array(bBar))
} else {
abBar.values
}

val x = CholeskyDecomposition.solve(aa, ab)

val aaInv = CholeskyDecomposition.inverse(aa, k)

val aaInv = CholeskyDecomposition.inverse(aaBar.values, k)
// aaInv is a packed upper triangular matrix, here we get all elements on diagonal
val diagInvAtWA = new DenseVector((1 to k).map { i =>
aaInv(i + (i - 1) * i / 2 - 1) / wSum }.toArray)

// compute intercept
val intercept = if (fitIntercept) {
bBar - BLAS.dot(aBar, x)
val (coefficients, intercept) = if (fitIntercept) {
(new DenseVector(x.slice(0, x.length - 1)), x.last)
} else {
0.0
(new DenseVector(x), 0.0)
}

new WeightedLeastSquaresModel(x, intercept, diagInvAtWA)
new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,8 +511,7 @@ class LinearRegressionSummary private[regression] (
}

/**
* Standard error of estimated coefficients.
* Note that standard error of estimated intercept is not supported currently.
* Standard error of estimated coefficients and intercept.
*/
lazy val coefficientStandardErrors: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
Expand All @@ -532,21 +531,26 @@ class LinearRegressionSummary private[regression] (
}
}

/** T-statistic of estimated coefficients.
* Note that t-statistic of estimated intercept is not supported currently.
*/
/**
* T-statistic of estimated coefficients and intercept.
*/
lazy val tValues: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
throw new UnsupportedOperationException(
"No t-statistic available for this LinearRegressionModel")
} else {
model.coefficients.toArray.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
val estimate = if (model.getFitIntercept) {
Array.concat(model.coefficients.toArray, Array(model.intercept))
} else {
model.coefficients.toArray
}
estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
}
}

/** Two-sided p-value of estimated coefficients.
* Note that p-value of estimated intercept is not supported currently.
*/
/**
* Two-sided p-value of estimated coefficients and intercept.
*/
lazy val pValues: Array[Double] = {
if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
throw new UnsupportedOperationException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -621,13 +621,13 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
assert(model.summary.objectiveHistory.length == 1)
assert(model.summary.objectiveHistory(0) == 0.0)
val devianceResidualsR = Array(-0.35566, 0.34504)
val seCoefR = Array(0.0011756, 0.0009032)
val tValsR = Array(3998, 7971)
val pValsR = Array(0, 0)
val seCoefR = Array(0.0011756, 0.0009032, 0.0018489)
val tValsR = Array(3998, 7971, 3407)
val pValsR = Array(0, 0, 0)
model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
assert(x._1 ~== x._2 absTol 1E-3) }
assert(x._1 ~== x._2 absTol 1E-5) }
model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x =>
assert(x._1 ~== x._2 absTol 1E-3) }
assert(x._1 ~== x._2 absTol 1E-5) }
model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) }
model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) }
}
Expand Down Expand Up @@ -789,9 +789,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
val coefficientsR = Vectors.dense(Array(6.080, -0.600))
val interceptR = 18.080
val devianceResidualsR = Array(-1.358, 1.920)
val seCoefR = Array(5.556, 1.960)
val tValsR = Array(1.094, -0.306)
val pValsR = Array(0.471, 0.811)
val seCoefR = Array(5.556, 1.960, 9.608)
val tValsR = Array(1.094, -0.306, 1.882)
val pValsR = Array(0.471, 0.811, 0.311)

assert(model.coefficients ~== coefficientsR absTol 1E-3)
assert(model.intercept ~== interceptR absTol 1E-3)
Expand Down
16 changes: 8 additions & 8 deletions python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
>>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal")
>>> model = lr.fit(df)
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> model.transform(test0).head().prediction
-1.0
>>> model.weights
DenseVector([1.0])
>>> model.intercept
0.0
>>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001
True
>>> abs(model.coefficients[0] - 1.0) < 0.001
True
>>> abs(model.intercept - 0.0) < 0.001
True
>>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
>>> model.transform(test1).head().prediction
1.0
>>> abs(model.transform(test1).head().prediction - 1.0) < 0.001
True
>>> lr.setParams("vector")
Traceback (most recent call last):
...
Expand Down

0 comments on commit 9da7cee

Please sign in to comment.