forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-36194][SQL] Add a logical plan visitor to propagate the distin…
…ct attributes ### What changes were proposed in this pull request? 1. This pr add a new logical plan visitor named `DistinctKeyVisitor` to find out all the distinct attributes in current logical plan. For example: ```scala spark.sql("CREATE TABLE t(a int, b int, c int) using parquet") spark.sql("SELECT a, b, a % 10, max(c), sum(b) FROM t GROUP BY a, b").queryExecution.analyzed.distinctKeys ``` The output is: {a#1, b#2}. 2. Enhance `RemoveRedundantAggregates` to remove the aggregation if it is groupOnly and the child can guarantee distinct. For example: ```sql set spark.sql.autoBroadcastJoinThreshold=-1; -- avoid PushDownLeftSemiAntiJoin create table t1 using parquet as select id a, id as b from range(10); create table t2 using parquet as select id as a, id as b from range(8); select t11.a, t11.b from (select distinct a, b from t1) t11 left semi join t2 on (t11.a = t2.a) group by t11.a, t11.b; ``` Before this PR: ``` == Optimized Logical Plan == Aggregate [a#6L, b#7L], [a#6L, b#7L], Statistics(sizeInBytes=1492.0 B) +- Join LeftSemi, (a#6L = a#8L), Statistics(sizeInBytes=1492.0 B) :- Aggregate [a#6L, b#7L], [a#6L, b#7L], Statistics(sizeInBytes=1492.0 B) : +- Filter isnotnull(a#6L), Statistics(sizeInBytes=1492.0 B) : +- Relation default.t1[a#6L,b#7L] parquet, Statistics(sizeInBytes=1492.0 B) +- Project [a#8L], Statistics(sizeInBytes=984.0 B) +- Filter isnotnull(a#8L), Statistics(sizeInBytes=1476.0 B) +- Relation default.t2[a#8L,b#9L] parquet, Statistics(sizeInBytes=1476.0 B) ``` After this PR: ``` == Optimized Logical Plan == Join LeftSemi, (a#6L = a#8L), Statistics(sizeInBytes=1492.0 B) :- Aggregate [a#6L, b#7L], [a#6L, b#7L], Statistics(sizeInBytes=1492.0 B) : +- Filter isnotnull(a#6L), Statistics(sizeInBytes=1492.0 B) : +- Relation default.t1[a#6L,b#7L] parquet, Statistics(sizeInBytes=1492.0 B) +- Project [a#8L], Statistics(sizeInBytes=984.0 B) +- Filter isnotnull(a#8L), Statistics(sizeInBytes=1476.0 B) +- Relation default.t2[a#8L,b#9L] parquet, 
Statistics(sizeInBytes=1476.0 B) ``` ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test and TPC-DS benchmark test. SQL | Before this PR(Seconds) | After this PR(Seconds) -- | -- | -- q14a | 206 | 193 q38 | 59 | 41 q87 | 127 | 113 Closes apache#35779 from wangyum/SPARK-36194. Authored-by: Yuming Wang <[email protected]> Signed-off-by: Yuming Wang <[email protected]>
- Loading branch information
Showing
31 changed files
with
4,770 additions
and
4,643 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
140 changes: 140 additions & 0 deletions
140
...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.catalyst.plans.logical | ||
|
||
import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, ExpressionSet, NamedExpression} | ||
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys | ||
import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, LeftSemiOrAnti, RightOuter} | ||
|
||
/**
 * A visitor pattern for traversing a [[LogicalPlan]] tree and propagate the distinct attributes.
 */
object DistinctKeyVisitor extends LogicalPlanVisitor[Set[ExpressionSet]] {

  /**
   * Rewrites the child's distinct key sets in terms of the attributes produced by
   * `projectList`, keeping only the key sets that are fully covered by the project's output.
   *
   * When `projectList` contains aliases, each key expression that semantically matches an
   * alias's child is replaced by the alias's output attribute, so a distinct key can survive
   * renaming (e.g. `SELECT a AS x` keeps {x} distinct if {a} was).
   */
  private def projectDistinctKeys(
      keys: Set[ExpressionSet], projectList: Seq[NamedExpression]): Set[ExpressionSet] = {
    // Attributes actually emitted by this projection; a key set is only useful downstream
    // if every expression in it is still present in the output.
    val outputSet = ExpressionSet(projectList.map(_.toAttribute))
    val aliases = projectList.filter(_.isInstanceOf[Alias])
    if (aliases.isEmpty) {
      keys.filter(_.subsetOf(outputSet))
    } else {
      // Substitute aliased expressions with their output attributes inside every key set.
      val aliasedDistinctKeys = keys.map { expressionSet =>
        expressionSet.map { expression =>
          expression transform {
            case expr: Expression =>
              // TODO: Expand distinctKeys for redundant aliases on the same expression
              aliases
                .collectFirst { case a: Alias if a.child.semanticEquals(expr) => a.toAttribute }
                .getOrElse(expr)
          }
        }
      }
      // Keep both the alias-rewritten key sets and any original key sets that still fit
      // entirely within the output.
      aliasedDistinctKeys.collect {
        case es: ExpressionSet if es.subsetOf(outputSet) => ExpressionSet(es)
      } ++ keys.filter(_.subsetOf(outputSet))
    }.filter(_.nonEmpty)
    // NOTE(review): by Scala precedence the trailing `.filter(_.nonEmpty)` above binds to the
    // `else` block only, not to the whole if/else expression — the `aliases.isEmpty` branch can
    // still return an empty ExpressionSet if one was passed in. Presumably harmless because
    // callers never supply empty key sets — confirm against the visit* call sites.
  }

  // By default a node propagates no distinct keys.
  override def default(p: LogicalPlan): Set[ExpressionSet] = Set.empty[ExpressionSet]

  override def visitAggregate(p: Aggregate): Set[ExpressionSet] = {
    val groupingExps = ExpressionSet(p.groupingExpressions) // handle group by a, a
    projectDistinctKeys(Set(groupingExps), p.aggregateExpressions)
  }

  // DISTINCT makes the whole output row unique.
  override def visitDistinct(p: Distinct): Set[ExpressionSet] = Set(ExpressionSet(p.output))

  // EXCEPT DISTINCT deduplicates its result; EXCEPT ALL does not.
  override def visitExcept(p: Except): Set[ExpressionSet] =
    if (!p.isAll) Set(ExpressionSet(p.output)) else default(p)

  override def visitExpand(p: Expand): Set[ExpressionSet] = default(p)

  // Filtering only removes rows, so the child's distinct keys remain valid.
  override def visitFilter(p: Filter): Set[ExpressionSet] = p.child.distinctKeys

  override def visitGenerate(p: Generate): Set[ExpressionSet] = default(p)

  override def visitGlobalLimit(p: GlobalLimit): Set[ExpressionSet] = {
    p.maxRows match {
      // At most one row: every output column set is trivially distinct.
      case Some(value) if value <= 1 => Set(ExpressionSet(p.output))
      case _ => p.child.distinctKeys
    }
  }

  // INTERSECT DISTINCT deduplicates its result; INTERSECT ALL does not.
  override def visitIntersect(p: Intersect): Set[ExpressionSet] = {
    if (!p.isAll) Set(ExpressionSet(p.output)) else default(p)
  }

  override def visitJoin(p: Join): Set[ExpressionSet] = {
    p match {
      // Semi/anti joins only filter the left side, so its distinct keys survive.
      case Join(_, _, LeftSemiOrAnti(_), _, _) =>
        p.left.distinctKeys
      case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, _, _, left, right, _)
          if left.distinctKeys.nonEmpty || right.distinctKeys.nonEmpty =>
        val rightJoinKeySet = ExpressionSet(rightKeys)
        val leftJoinKeySet = ExpressionSet(leftKeys)
        joinType match {
          // Both sides join on (a superset of) a distinct key: each input row matches at
          // most one row on the other side, so both sides' keys are preserved.
          case Inner if left.distinctKeys.exists(_.subsetOf(leftJoinKeySet)) &&
            right.distinctKeys.exists(_.subsetOf(rightJoinKeySet)) =>
            left.distinctKeys ++ right.distinctKeys
          // The opposite side is unique on its join keys: rows on this side are not duplicated.
          case Inner | LeftOuter if right.distinctKeys.exists(_.subsetOf(rightJoinKeySet)) =>
            p.left.distinctKeys
          case Inner | RightOuter if left.distinctKeys.exists(_.subsetOf(leftJoinKeySet)) =>
            p.right.distinctKeys
          case _ =>
            default(p)
        }
      case _ => default(p)
    }
  }

  override def visitLocalLimit(p: LocalLimit): Set[ExpressionSet] = p.child.distinctKeys

  override def visitPivot(p: Pivot): Set[ExpressionSet] = default(p)

  override def visitProject(p: Project): Set[ExpressionSet] = {
    if (p.child.distinctKeys.nonEmpty) {
      projectDistinctKeys(p.child.distinctKeys, p.projectList)
    } else {
      default(p)
    }
  }

  // Repartitioning reorders rows but neither adds nor removes any.
  override def visitRepartition(p: Repartition): Set[ExpressionSet] = p.child.distinctKeys

  override def visitRepartitionByExpr(p: RepartitionByExpression): Set[ExpressionSet] =
    p.child.distinctKeys

  // Sampling without replacement only drops rows; with replacement it may duplicate them.
  override def visitSample(p: Sample): Set[ExpressionSet] = {
    if (!p.withReplacement) p.child.distinctKeys else default(p)
  }

  override def visitScriptTransform(p: ScriptTransformation): Set[ExpressionSet] = default(p)

  // UNION (even DISTINCT at this level) can interleave rows from several children;
  // no child key is guaranteed to stay unique.
  override def visitUnion(p: Union): Set[ExpressionSet] = default(p)

  // Window appends columns without changing row multiplicity.
  override def visitWindow(p: Window): Set[ExpressionSet] = p.child.distinctKeys

  override def visitTail(p: Tail): Set[ExpressionSet] = p.child.distinctKeys

  override def visitSort(p: Sort): Set[ExpressionSet] = p.child.distinctKeys

  override def visitRebalancePartitions(p: RebalancePartitions): Set[ExpressionSet] =
    p.child.distinctKeys

  override def visitWithCTE(p: WithCTE): Set[ExpressionSet] = p.plan.distinctKeys
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
34 changes: 34 additions & 0 deletions
34
.../src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.catalyst.plans.logical | ||
|
||
import org.apache.spark.sql.catalyst.expressions.ExpressionSet | ||
import org.apache.spark.sql.internal.SQLConf.PROPAGATE_DISTINCT_KEYS_ENABLED | ||
|
||
/**
 * Mixes a `distinctKeys` property into [[LogicalPlan]]: the sets of attributes whose values
 * are known to be distinct in this plan's output. For example:
 * {{{
 *   SELECT a, b, SUM(c) FROM Tab1 GROUP BY a, b
 *   // returns a, b
 * }}}
 */
trait LogicalPlanDistinctKeys { self: LogicalPlan =>
  // Computed lazily and at most once per plan node; empty when propagation is turned off
  // via the PROPAGATE_DISTINCT_KEYS_ENABLED configuration.
  lazy val distinctKeys: Set[ExpressionSet] = {
    if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) {
      DistinctKeyVisitor.visit(self)
    } else {
      Set.empty[ExpressionSet]
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.