Skip to content

Commit

Permalink
[SPARK-4753][SQL] Use catalyst for partition pruning in newParquet.
Browse files Browse the repository at this point in the history
Author: Michael Armbrust <[email protected]>

Closes apache#3613 from marmbrus/parquetPartitionPruning and squashes the following commits:

4f138f8 [Michael Armbrust] Use catalyst for partition pruning in newParquet.
  • Loading branch information
marmbrus authored and pwendell committed Dec 5, 2014
1 parent fd85253 commit f5801e8
Showing 1 changed file with 28 additions and 30 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,6 +22,7 @@ import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce.{JobContext, InputSplit, Job}
import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate

import parquet.hadoop.ParquetInputFormat
import parquet.hadoop.util.ContextUtil
Expand All @@ -31,8 +32,8 @@ import org.apache.spark.{Partition => SparkPartition, Logging}
import org.apache.spark.rdd.{NewHadoopPartition, RDD}

import org.apache.spark.sql.{SQLConf, Row, SQLContext}
import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, And, Expression, Attribute}
import org.apache.spark.sql.catalyst.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.sql.sources._

import scala.collection.JavaConversions._
Expand Down Expand Up @@ -151,44 +152,41 @@ case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext)
override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = {
// This is mostly a hack so that we can use the existing parquet filter code.
val requiredColumns = output.map(_.name)
// TODO: Parquet filters should be based on data sources API, not catalyst expressions.
val filters = DataSourceStrategy.selectFilters(predicates)

val job = new Job(sparkContext.hadoopConfiguration)
ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport])
val jobConf: Configuration = ContextUtil.getConfiguration(job)

val requestedSchema = StructType(requiredColumns.map(schema(_)))

// TODO: Make folder based partitioning a first class citizen of the Data Sources API.
val partitionFilters = filters.collect {
case e @ EqualTo(attr, value) if partitionKeys.contains(attr) =>
logInfo(s"Parquet scan partition filter: $attr=$value")
(p: Partition) => p.partitionValues(attr) == value

case e @ In(attr, values) if partitionKeys.contains(attr) =>
logInfo(s"Parquet scan partition filter: $attr IN ${values.mkString("{", ",", "}")}")
val set = values.toSet
(p: Partition) => set.contains(p.partitionValues(attr))

case e @ GreaterThan(attr, value) if partitionKeys.contains(attr) =>
logInfo(s"Parquet scan partition filter: $attr > $value")
(p: Partition) => p.partitionValues(attr).asInstanceOf[Int] > value.asInstanceOf[Int]

case e @ GreaterThanOrEqual(attr, value) if partitionKeys.contains(attr) =>
logInfo(s"Parquet scan partition filter: $attr >= $value")
(p: Partition) => p.partitionValues(attr).asInstanceOf[Int] >= value.asInstanceOf[Int]
val partitionKeySet = partitionKeys.toSet
val rawPredicate =
predicates
.filter(_.references.map(_.name).toSet.subsetOf(partitionKeySet))
.reduceOption(And)
.getOrElse(Literal(true))

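// Translate the predicate so that it reads from the information derived from the
// folder structure: each attribute reference is replaced by a BoundReference whose
// ordinal is the attribute's position in partitionKeys.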
val castedPredicate = rawPredicate transform {
case a: AttributeReference =>
val idx = partitionKeys.indexWhere(a.name == _)
BoundReference(idx, IntegerType, nullable = true)
}

case e @ LessThan(attr, value) if partitionKeys.contains(attr) =>
logInfo(s"Parquet scan partition filter: $attr < $value")
(p: Partition) => p.partitionValues(attr).asInstanceOf[Int] < value.asInstanceOf[Int]
val inputData = new GenericMutableRow(partitionKeys.size)
val pruningCondition = InterpretedPredicate(castedPredicate)

case e @ LessThanOrEqual(attr, value) if partitionKeys.contains(attr) =>
logInfo(s"Parquet scan partition filter: $attr <= $value")
(p: Partition) => p.partitionValues(attr).asInstanceOf[Int] <= value.asInstanceOf[Int]
}
val selectedPartitions =
if (partitionKeys.nonEmpty && predicates.nonEmpty) {
partitions.filter { part =>
inputData(0) = part.partitionValues.values.head
pruningCondition(inputData)
}
} else {
partitions
}

val selectedPartitions = partitions.filter(p => partitionFilters.forall(_(p)))
val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration)
val selectedFiles = selectedPartitions.flatMap(_.files).map(f => fs.makeQualified(f.getPath))
// FileInputFormat cannot handle empty lists.
Expand Down

0 comments on commit f5801e8

Please sign in to comment.