Commit 0bf605c

cloud-fan authored and gatorsmile committed
[SPARK-19292][SQL] filter with partition columns should be case-insensitive on Hive tables

## What changes were proposed in this pull request?

When we query a table with a filter on partitioned columns, we will push the partition filter to the metastore to get matched partitions directly. In `HiveExternalCatalog.listPartitionsByFilter`, we assume the column names in the partition filter are already normalized, so we don't need to consider case sensitivity there. However, `HiveTableScanExec` doesn't follow this assumption. This PR fixes it.

## How was this patch tested?

New regression test.

Author: Wenchen Fan <[email protected]>

Closes apache#16647 from cloud-fan/bug.
1 parent 148a84b commit 0bf605c
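To make the assumption in the description concrete, here is a hypothetical, dependency-free Scala sketch of the metastore-side contract: the lookup compares filter column names to partition column names exactly, so a filter written as `J` matches nothing when the stored partition column is `j`. The names below (`Partition`, the simplified `listPartitionsByFilter` signature) are illustrative stand-ins, not Spark's actual API.

```scala
// Illustrative model: a partition is just a map from partition column
// name to value (real Hive partitions carry far more metadata).
case class Partition(spec: Map[String, Int])

// Exact-name lookup, mirroring the assumption that callers pass
// already-normalized column names. A filter on "J" finds nothing when
// the stored column is "j" -- the failure mode this commit fixes upstream.
def listPartitionsByFilter(
    partitions: Seq[Partition],
    filterCol: String,
    value: Int): Seq[Partition] =
  partitions.filter(_.spec.get(filterCol).contains(value))

val parts = Seq(Partition(Map("j" -> 10)), Partition(Map("j" -> 20)))
listPartitionsByFilter(parts, "j", 10)  // matches one partition
listPartitionsByFilter(parts, "J", 10)  // matches none: case mismatch
```

This is why the scan operator, not the catalog, must normalize the filter's attribute names before pushing it down.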

File tree

3 files changed: +25 −2 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala (+1 −1)

```diff
@@ -62,7 +62,7 @@ object FileSourceStrategy extends Strategy with Logging {
       val filterSet = ExpressionSet(filters)

       // The attribute name of predicate could be different than the one in schema in case of
-      // case insensitive, we should change them to match the one in schema, so we donot need to
+      // case insensitive, we should change them to match the one in schema, so we do not need to
       // worry about case sensitivity anymore.
       val normalizedFilters = filters.map { e =>
         e transform {
```

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala (+11 −1)

```diff
@@ -146,9 +146,19 @@ case class HiveTableScanExec(
         hadoopReader.makeRDDForTable(relation.hiveQlTable)
       }
     } else {
+      // The attribute name of predicate could be different than the one in schema in case of
+      // case insensitive, we should change them to match the one in schema, so we do not need to
+      // worry about case sensitivity anymore.
+      val normalizedFilters = partitionPruningPred.map { e =>
+        e transform {
+          case a: AttributeReference =>
+            a.withName(relation.output.find(_.semanticEquals(a)).get.name)
+        }
+      }
+
       Utils.withDummyCallSite(sqlContext.sparkContext) {
         hadoopReader.makeRDDForPartitionedTable(
-          prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
+          prunePartitions(relation.getHiveQlPartitions(normalizedFilters)))
       }
     }
     val numOutputRows = longMetric("numOutputRows")
```
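The added normalization can be sketched outside Spark with a toy expression tree. `Expr`, `Attribute`, and the simplified `transform` below are illustrative stand-ins for Catalyst's `Expression`, `AttributeReference`, and `transform`, and plain case-insensitive name matching stands in for `semanticEquals` under `spark.sql.caseSensitive=false` — a sketch of the idea, not the real API.

```scala
// Toy expression tree, loosely mirroring Catalyst's shape.
sealed trait Expr {
  // Bottom-up rewrite with a partial function, like Catalyst's `transform`.
  def transform(rule: PartialFunction[Expr, Expr]): Expr = {
    val rewritten = this match {
      case EqualTo(l, r) => EqualTo(l.transform(rule), r.transform(rule))
      case other => other
    }
    rule.applyOrElse(rewritten, identity[Expr])
  }
}
case class Attribute(name: String) extends Expr
case class Literal(value: Int) extends Expr
case class EqualTo(left: Expr, right: Expr) extends Expr

// Rewrite every attribute in `filter` to carry the exact name used in
// `schema`, matching case-insensitively, so downstream exact-name
// lookups (e.g. partition pruning) work regardless of how the user
// capitalized the column.
def normalize(filter: Expr, schema: Seq[Attribute]): Expr =
  filter.transform {
    case a: Attribute =>
      schema.find(_.name.equalsIgnoreCase(a.name)).getOrElse(a)
  }

val schema = Seq(Attribute("i"), Attribute("j"))
val pred = Attribute("J")  // user wrote the partition column as "J"
normalize(EqualTo(pred, Literal(10)), schema)
// yields EqualTo(Attribute("j"), Literal(10))
```

The real patch does the same thing one level up: it rewrites each `AttributeReference` in `partitionPruningPred` to use the name from `relation.output` before handing the predicates to the metastore.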

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala (+13 −0)

```diff
@@ -2014,4 +2014,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       )
     }
   }
+
+  test("SPARK-19292: filter with partition columns should be case-insensitive on Hive tables") {
+    withTable("tbl") {
+      withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
+        sql("CREATE TABLE tbl(i int, j int) USING hive PARTITIONED BY (j)")
+        sql("INSERT INTO tbl PARTITION(j=10) SELECT 1")
+        checkAnswer(spark.table("tbl"), Row(1, 10))
+
+        checkAnswer(sql("SELECT i, j FROM tbl WHERE J=10"), Row(1, 10))
+        checkAnswer(spark.table("tbl").filter($"J" === 10), Row(1, 10))
+      }
+    }
+  }
 }
```
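Putting the two pieces together, what the regression test exercises can be sketched as a two-step pipeline (hypothetical helper, not Spark code): first rewrite the filter's column name to the schema's canonical case, then prune partitions by exact name match, as the metastore layer assumes.

```scala
// Step 1: normalize the filter column's case against the table schema.
// Step 2: exact-name pruning over partition specs (column -> value maps).
def prune(
    partitions: Seq[Map[String, Int]],
    schema: Seq[String],
    filterCol: String,
    value: Int): Seq[Map[String, Int]] = {
  val normalized = schema.find(_.equalsIgnoreCase(filterCol)).getOrElse(filterCol)
  partitions.filter(_.get(normalized).contains(value))
}

val parts = Seq(Map("j" -> 10), Map("j" -> 20))
prune(parts, Seq("i", "j"), "J", 10)  // the filter on "J" now prunes correctly
```

Without step 1 this reduces to the pre-fix behavior: `"J"` would miss the `"j"` partition entirely, which is exactly what `WHERE J=10` in the new test would have hit.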
