[SPARK-36194][SQL] Add a logical plan visitor to propagate the distin…

…ct attributes ### What changes were proposed in this pull request? 1. This pr add a new logical plan visitor named `DistinctKeyVisitor` to find out all the distinct attributes in current logical plan. For example: ```scala spark.sql("CREATE TABLE t(a int, b int, c int) using parquet") spark.sql("SELECT a, b, a % 10, max(c), sum(b) FROM t GROUP BY a, b").queryExecution.analyzed.distinctKeys ``` The output is: {a#1, b#2}. 2. Enhance `RemoveRedundantAggregates` to remove the aggregation if it is groupOnly and the child can guarantee distinct. For example: ```sql set spark.sql.autoBroadcastJoinThreshold=-1; -- avoid PushDownLeftSemiAntiJoin create table t1 using parquet as select id a, id as b from range(10); create table t2 using parquet as select id as a, id as b from range(8); select t11.a, t11.b from (select distinct a, b from t1) t11 left semi join t2 on (t11.a = t2.a) group by t11.a, t11.b; ``` Before this PR: ``` == Optimized Logical Plan == Aggregate [a#6L, b#7L], [a#6L, b#7L], Statistics(sizeInBytes=1492.0 B) +- Join LeftSemi, (a#6L = a#8L), Statistics(sizeInBytes=1492.0 B) :- Aggregate [a#6L, b#7L], [a#6L, b#7L], Statistics(sizeInBytes=1492.0 B) : +- Filter isnotnull(a#6L), Statistics(sizeInBytes=1492.0 B) : +- Relation default.t1[a#6L,b#7L] parquet, Statistics(sizeInBytes=1492.0 B) +- Project [a#8L], Statistics(sizeInBytes=984.0 B) +- Filter isnotnull(a#8L), Statistics(sizeInBytes=1476.0 B) +- Relation default.t2[a#8L,b#9L] parquet, Statistics(sizeInBytes=1476.0 B) ``` After this PR: ``` == Optimized Logical Plan == Join LeftSemi, (a#6L = a#8L), Statistics(sizeInBytes=1492.0 B) :- Aggregate [a#6L, b#7L], [a#6L, b#7L], Statistics(sizeInBytes=1492.0 B) : +- Filter isnotnull(a#6L), Statistics(sizeInBytes=1492.0 B) : +- Relation default.t1[a#6L,b#7L] parquet, Statistics(sizeInBytes=1492.0 B) +- Project [a#8L], Statistics(sizeInBytes=984.0 B) +- Filter isnotnull(a#8L), Statistics(sizeInBytes=1476.0 B) +- Relation default.t2[a#8L,b#9L] parquet, Statistics(sizeInBytes=1476.0 B) ``` ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test and TPC-DS benchmark test. SQL | Before this PR(Seconds) | After this PR(Seconds) -- | -- | -- q14a | 206 | 193 q38 | 59 | 41 q87 | 127 | 113 Closes apache#35779 from wangyum/SPARK-36194. Authored-by: Yuming Wang <[email protected]> Signed-off-by: Yuming Wang <[email protected]>
mantov5 · Mar 14, 2022 · c16a66a · c16a66a
1 parent 0005b41
commit c16a66a
Show file tree

Hide file tree

Showing 31 changed files with 4,770 additions and 4,643 deletions.
diff --git a/...st/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala b/...st/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala
@@ -18,9 +18,9 @@
 package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.analysis.PullOutNondeterministic
-import org.apache.spark.sql.catalyst.expressions.{AliasHelper, AttributeSet}
+import org.apache.spark.sql.catalyst.expressions.{AliasHelper, AttributeSet, ExpressionSet}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
-import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreePattern.AGGREGATE
 
@@ -47,6 +47,10 @@ object RemoveRedundantAggregates extends Rule[LogicalPlan] with AliasHelper {
       } else {
         newAggregate
       }
+
+    case agg @ Aggregate(groupingExps, _, child)
+        if agg.groupOnly && child.distinctKeys.exists(_.subsetOf(ExpressionSet(groupingExps))) =>
+      Project(agg.aggregateExpressions, child)
   }
 
   private def isLowerRedundant(upper: Aggregate, lower: Aggregate): Boolean = {

diff --git a/...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala b/...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, ExpressionSet, NamedExpression}
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, LeftSemiOrAnti, RightOuter}
+
+/**
+ * A visitor pattern for traversing a [[LogicalPlan]] tree and propagate the distinct attributes.
+ */
+object DistinctKeyVisitor extends LogicalPlanVisitor[Set[ExpressionSet]] {
+
+  private def projectDistinctKeys(
+      keys: Set[ExpressionSet], projectList: Seq[NamedExpression]): Set[ExpressionSet] = {
+    val outputSet = ExpressionSet(projectList.map(_.toAttribute))
+    val aliases = projectList.filter(_.isInstanceOf[Alias])
+    if (aliases.isEmpty) {
+      keys.filter(_.subsetOf(outputSet))
+    } else {
+      val aliasedDistinctKeys = keys.map { expressionSet =>
+        expressionSet.map { expression =>
+          expression transform {
+            case expr: Expression =>
+              // TODO: Expand distinctKeys for redundant aliases on the same expression
+              aliases
+                .collectFirst { case a: Alias if a.child.semanticEquals(expr) => a.toAttribute }
+                .getOrElse(expr)
+          }
+        }
+      }
+      aliasedDistinctKeys.collect {
+        case es: ExpressionSet if es.subsetOf(outputSet) => ExpressionSet(es)
+      } ++ keys.filter(_.subsetOf(outputSet))
+    }.filter(_.nonEmpty)
+  }
+
+  override def default(p: LogicalPlan): Set[ExpressionSet] = Set.empty[ExpressionSet]
+
+  override def visitAggregate(p: Aggregate): Set[ExpressionSet] = {
+    val groupingExps = ExpressionSet(p.groupingExpressions) // handle group by a, a
+    projectDistinctKeys(Set(groupingExps), p.aggregateExpressions)
+  }
+
+  override def visitDistinct(p: Distinct): Set[ExpressionSet] = Set(ExpressionSet(p.output))
+
+  override def visitExcept(p: Except): Set[ExpressionSet] =
+    if (!p.isAll) Set(ExpressionSet(p.output)) else default(p)
+
+  override def visitExpand(p: Expand): Set[ExpressionSet] = default(p)
+
+  override def visitFilter(p: Filter): Set[ExpressionSet] = p.child.distinctKeys
+
+  override def visitGenerate(p: Generate): Set[ExpressionSet] = default(p)
+
+  override def visitGlobalLimit(p: GlobalLimit): Set[ExpressionSet] = {
+    p.maxRows match {
+      case Some(value) if value <= 1 => Set(ExpressionSet(p.output))
+      case _ => p.child.distinctKeys
+    }
+  }
+
+  override def visitIntersect(p: Intersect): Set[ExpressionSet] = {
+    if (!p.isAll) Set(ExpressionSet(p.output)) else default(p)
+  }
+
+  override def visitJoin(p: Join): Set[ExpressionSet] = {
+    p match {
+      case Join(_, _, LeftSemiOrAnti(_), _, _) =>
+        p.left.distinctKeys
+      case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, _, _, left, right, _)
+          if left.distinctKeys.nonEmpty || right.distinctKeys.nonEmpty =>
+        val rightJoinKeySet = ExpressionSet(rightKeys)
+        val leftJoinKeySet = ExpressionSet(leftKeys)
+        joinType match {
+          case Inner if left.distinctKeys.exists(_.subsetOf(leftJoinKeySet)) &&
+            right.distinctKeys.exists(_.subsetOf(rightJoinKeySet)) =>
+            left.distinctKeys ++ right.distinctKeys
+          case Inner | LeftOuter if right.distinctKeys.exists(_.subsetOf(rightJoinKeySet)) =>
+            p.left.distinctKeys
+          case Inner | RightOuter if left.distinctKeys.exists(_.subsetOf(leftJoinKeySet)) =>
+            p.right.distinctKeys
+          case _ =>
+            default(p)
+        }
+      case _ => default(p)
+    }
+  }
+
+  override def visitLocalLimit(p: LocalLimit): Set[ExpressionSet] = p.child.distinctKeys
+
+  override def visitPivot(p: Pivot): Set[ExpressionSet] = default(p)
+
+  override def visitProject(p: Project): Set[ExpressionSet] = {
+    if (p.child.distinctKeys.nonEmpty) {
+      projectDistinctKeys(p.child.distinctKeys, p.projectList)
+    } else {
+      default(p)
+    }
+  }
+
+  override def visitRepartition(p: Repartition): Set[ExpressionSet] = p.child.distinctKeys
+
+  override def visitRepartitionByExpr(p: RepartitionByExpression): Set[ExpressionSet] =
+    p.child.distinctKeys
+
+  override def visitSample(p: Sample): Set[ExpressionSet] = {
+    if (!p.withReplacement) p.child.distinctKeys else default(p)
+  }
+
+  override def visitScriptTransform(p: ScriptTransformation): Set[ExpressionSet] = default(p)
+
+  override def visitUnion(p: Union): Set[ExpressionSet] = default(p)
+
+  override def visitWindow(p: Window): Set[ExpressionSet] = p.child.distinctKeys
+
+  override def visitTail(p: Tail): Set[ExpressionSet] = p.child.distinctKeys
+
+  override def visitSort(p: Sort): Set[ExpressionSet] = p.child.distinctKeys
+
+  override def visitRebalancePartitions(p: RebalancePartitions): Set[ExpressionSet] =
+    p.child.distinctKeys
+
+  override def visitWithCTE(p: WithCTE): Set[ExpressionSet] = p.plan.distinctKeys
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -31,6 +31,7 @@ abstract class LogicalPlan
   extends QueryPlan[LogicalPlan]
   with AnalysisHelper
   with LogicalPlanStats
+  with LogicalPlanDistinctKeys
   with QueryPlanConstraints
   with Logging {
 

diff --git a/.../src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala b/.../src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.ExpressionSet
+import org.apache.spark.sql.internal.SQLConf.PROPAGATE_DISTINCT_KEYS_ENABLED
+
+/**
+ * A trait to add distinct attributes to [[LogicalPlan]]. For example:
+ * {{{
+ *   SELECT a, b, SUM(c) FROM Tab1 GROUP BY a, b
+ *   // returns a, b
+ * }}}
+ */
+trait LogicalPlanDistinctKeys { self: LogicalPlan =>
+  lazy val distinctKeys: Set[ExpressionSet] = {
+    if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) DistinctKeyVisitor.visit(self) else Set.empty
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -744,6 +744,15 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
+  val PROPAGATE_DISTINCT_KEYS_ENABLED =
+    buildConf("spark.sql.optimizer.propagateDistinctKeys.enabled")
+      .internal()
+      .doc("When true, the query optimizer will propagate a set of distinct attributes from the " +
+        "current node and use it to optimize query.")
+      .version("3.3.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val ESCAPED_STRING_LITERALS = buildConf("spark.sql.parser.escapedStringLiterals")
     .internal()
     .doc("When true, string literals (including regex patterns) remain escaped in our SQL " +