From 49e8bd9d7be91be1f6b23e1e929623cffbd126d2 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 8 Jun 2021 13:56:59 +0800 Subject: [PATCH 01/37] Support optimize skew join even if introduce extra shuffle --- .../apache/spark/sql/internal/SQLConf.scala | 6 ++ .../adaptive/AdaptiveSparkPlanExec.scala | 55 ++++++++++++++++-- .../adaptive/OptimizeSkewedJoin.scala | 19 +----- .../adaptive/SkewJoinAwareCost.scala | 58 +++++++++++++++++++ .../adaptive/AdaptiveQueryExecSuite.scala | 35 +++++++++++ 5 files changed, 153 insertions(+), 20 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3aed3c274fc76..f85a0b7d4d189 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -653,6 +653,12 @@ object SQLConf { .booleanConf .createWithDefault(true) + val ADAPTIVE_FORCE_ENABLE_SKEW_JOIN = buildConf("spark.sql.adaptive.forceEnableSkewJoin") + .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.") + .version("3.2.0") + .booleanConf + .createWithDefault(false) + val SUBEXPRESSION_ELIMINATION_ENABLED = buildConf("spark.sql.subexpressionElimination.enabled") .internal() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index cbf70e37ce961..088cb677e5a82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -88,6 +88,12 @@ case class AdaptiveSparkPlanExec( private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = 
Seq( RemoveRedundantProjects, EnsureRequirements, + // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of + // skew join and extra shuffle nodes. + OptimizeSkewedJoin, + // Add the EnsureRequirements rule here since OptimizeSkewedJoin may change the + // output partitioning + EnsureRequirements, RemoveRedundantSorts, DisableUnnecessaryBucketedScan ) ++ context.session.sessionState.queryStagePrepRules @@ -97,8 +103,6 @@ case class AdaptiveSparkPlanExec( @transient private val queryStageOptimizerRules: Seq[Rule[SparkPlan]] = Seq( PlanAdaptiveDynamicPruningFilters(this), ReuseAdaptiveSubquery(context.subqueryCache), - // Skew join does not handle `CustomShuffleReader` so needs to be applied first. - OptimizeSkewedJoin, OptimizeSkewInRebalancePartitions, CoalesceShufflePartitions(context.session), // `OptimizeLocalShuffleReader` needs to make use of 'CustomShuffleReaderExec.partitionSpecs' @@ -113,6 +117,19 @@ case class AdaptiveSparkPlanExec( CollapseCodegenStages() ) + // OptimizeSkewedJoin has moved into preparation rules, so we should make + // finalPreparationStageRules same as finalStageOptimizerRules + private def finalPreparationStageRules: Seq[Rule[SparkPlan]] = { + val origins = inputPlan.collect { + case s: ShuffleExchangeLike => s.shuffleOrigin + } + (preprocessingRules ++ queryStagePreparationRules).filter { + case c: CustomShuffleReaderRule => + origins.forall(c.supportedShuffleOrigins.contains) + case _ => true + } + } + // The partitioning of the query output depends on the shuffle(s) in the final stage. 
If the // original plan contains a repartition operator, we need to preserve the specified partitioning, // whether or not the repartition-introduced shuffle is optimized out because of an underlying @@ -130,7 +147,12 @@ case class AdaptiveSparkPlanExec( } } - @transient private val costEvaluator = SimpleCostEvaluator + @transient private val costEvaluator = + if (conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN)) { + SkewJoinAwareCostEvaluator + } else { + SimpleCostEvaluator + } @transient val initialPlan = context.session.withActive { applyPhysicalRules( @@ -593,6 +615,25 @@ case class AdaptiveSparkPlanExec( logicalPlan } + private def isFinalStage(sparkPlan: SparkPlan): Boolean = { + sparkPlan match { + // avoid top level node is Exchange + case _: Exchange => false + case plan => + // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage + // and all query stages are materialized. + plan.find { + case p if p.children.exists( + child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) => + p match { + case stage: QueryStageExec if stage.isMaterialized => false + case _ => true + } + case _ => false + }.isEmpty + } + } + /** * Re-optimize and run physical planning on the current logical plan based on the latest stats. 
*/ @@ -600,9 +641,15 @@ case class AdaptiveSparkPlanExec( logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() + val rules = if (isFinalStage(sparkPlan)) { + finalPreparationStageRules + } else { + preprocessingRules ++ queryStagePreparationRules + } + val newPlan = applyPhysicalRules( sparkPlan, - preprocessingRules ++ queryStagePreparationRules, + rules, Some((planChangeLogger, "AQE Replanning"))) // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index a284016bfb2ef..96c030fcd237b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -23,7 +23,7 @@ import org.apache.commons.io.FileUtils import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleOrigin} +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleOrigin} import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.internal.SQLConf @@ -52,8 +52,6 @@ object OptimizeSkewedJoin extends CustomShuffleReaderRule { override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS) - private val ensureRequirements = EnsureRequirements - /** * A partition is considered as a skewed partition if its size is larger than the median * partition size * SKEW_JOIN_SKEWED_PARTITION_FACTOR and also larger than @@ -248,18 +246,7 @@ object OptimizeSkewedJoin extends CustomShuffleReaderRule { // Shuffle // Sort // 
Shuffle - val optimizePlan = optimizeSkewJoin(plan) - val numShuffles = ensureRequirements.apply(optimizePlan).collect { - case e: ShuffleExchangeExec => e - }.length - - if (numShuffles > 0) { - logDebug("OptimizeSkewedJoin rule is not applied due" + - " to additional shuffles will be introduced.") - plan - } else { - optimizePlan - } + optimizeSkewJoin(plan) } else { plan } @@ -268,7 +255,7 @@ object OptimizeSkewedJoin extends CustomShuffleReaderRule { private object ShuffleStage { def unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match { - case s: ShuffleQueryStageExec if s.mapStats.isDefined && + case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined && OptimizeSkewedJoin.supportedShuffleOrigins.contains(s.shuffle.shuffleOrigin) => Some(s) case _ => None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala new file mode 100644 index 0000000000000..a2e101216a548 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike +import org.apache.spark.sql.execution.joins.ShuffledJoin + +/** + * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number + */ +case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost { + override def compare(that: Cost): Int = that match { + case other: SkewJoinAwareCost => + if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) { + // If more skew joins are optimized or less shuffle nodes, it means the cost is lower + -1 + } else if (numShuffles > other.numShuffles) { + 1 + } else { + 0 + } + case _ => + throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString) + } +} + +/** + * A skew join aware implementation of [[CostEvaluator]], which counts the number of + * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan. 
+ */ +object SkewJoinAwareCostEvaluator extends CostEvaluator { + override def evaluateCost(plan: SparkPlan): Cost = { + val shuffleNumber = plan.collect { + case s: ShuffleExchangeLike => s + }.size + val skewJoinNumber = plan.collect { + case j: ShuffledJoin if j.isSkewJoin => j + }.size + SkewJoinAwareCost(shuffleNumber, skewJoinNumber) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 2343a9236e4f4..7b2eab28c51cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1786,6 +1786,7 @@ class AdaptiveQueryExecSuite } } +<<<<<<< HEAD test("SPARK-35650: Coalesce number of partitions by AEQ") { withSQLConf(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1") { Seq("REPARTITION", "REBALANCE(key)") @@ -1885,4 +1886,38 @@ class AdaptiveQueryExecSuite } } } + + test("SPARK-33832: Support optimize skew join even if introduce extra shuffle") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100", + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100", + SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1", + SQLConf.SHUFFLE_PARTITIONS.key -> "10", + SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN.key -> "true") { + withTempView("skewData1", "skewData2") { + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 3 as key1", "id as value1") + .createOrReplaceTempView("skewData1") + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 1 as key2", "id as value2") + .createOrReplaceTempView("skewData2") + + val (_, adaptive) = runAdaptiveAndVerifyResult( + "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1") + val smj = 
findTopLevelSortMergeJoin(adaptive) + assert(smj.size == 1 && smj.forall(_.isSkewJoin)) + checkNumLocalShuffleReaders(adaptive, 3) + + val (_, adaptive2) = runAdaptiveAndVerifyResult( + "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2") + val smj2 = findTopLevelSortMergeJoin(adaptive2) + assert(smj2.size == 1 && smj2.forall(_.isSkewJoin)) + checkNumLocalShuffleReaders(adaptive2, 3) + } + } + } } From db77ddd6f743ebc61e0a5539a025128b23eb2814 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 8 Jun 2021 17:42:58 +0800 Subject: [PATCH 02/37] EnsureRequirements --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 088cb677e5a82..216b837f241d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -641,7 +641,7 @@ case class AdaptiveSparkPlanExec( logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() - val rules = if (isFinalStage(sparkPlan)) { + val rules = if (isFinalStage(EnsureRequirements.apply(sparkPlan))) { finalPreparationStageRules } else { preprocessingRules ++ queryStagePreparationRules From a63cd7286e77ee68dad90f54925373b29f3d6398 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 9 Jun 2021 11:48:07 +0800 Subject: [PATCH 03/37] make a new rules --- .../adaptive/AdaptiveSparkPlanExec.scala | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 216b837f241d8..017ee417f75bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -88,15 +88,20 @@ case class AdaptiveSparkPlanExec( private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( RemoveRedundantProjects, EnsureRequirements, + RemoveRedundantSorts, + DisableUnnecessaryBucketedScan + ) ++ context.session.sessionState.queryStagePrepRules + + // This list rules are applied between queryStagePreparationRules and estimate physical plan cost + // so that we can support introduce extra shuffle + @transient private val queryStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq( // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of // skew join and extra shuffle nodes. OptimizeSkewedJoin, // Add the EnsureRequirements rule here since OptimizeSkewedJoin may change the // output partitioning - EnsureRequirements, - RemoveRedundantSorts, - DisableUnnecessaryBucketedScan - ) ++ context.session.sessionState.queryStagePrepRules + EnsureRequirements + ) // A list of physical optimizer rules to be applied to a new stage before its execution. These // optimizations should be stage-independent. @@ -117,13 +122,13 @@ case class AdaptiveSparkPlanExec( CollapseCodegenStages() ) - // OptimizeSkewedJoin has moved into preparation rules, so we should make - // finalPreparationStageRules same as finalStageOptimizerRules - private def finalPreparationStageRules: Seq[Rule[SparkPlan]] = { + // OptimizeSkewedJoin has moved into this rules, so we should follow the finalStageOptimizerRules + // for the final stage. 
+ private def finalStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = { val origins = inputPlan.collect { case s: ShuffleExchangeLike => s.shuffleOrigin } - (preprocessingRules ++ queryStagePreparationRules).filter { + queryStagePreparationWithExtraShuffleRules.filter { case c: CustomShuffleReaderRule => origins.forall(c.supportedShuffleOrigins.contains) case _ => true @@ -641,15 +646,19 @@ case class AdaptiveSparkPlanExec( logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() - val rules = if (isFinalStage(EnsureRequirements.apply(sparkPlan))) { - finalPreparationStageRules - } else { - preprocessingRules ++ queryStagePreparationRules - } - val newPlan = applyPhysicalRules( sparkPlan, - rules, + preprocessingRules ++ queryStagePreparationRules, + Some((planChangeLogger, "AQE Replanning"))) + + val preparationWithExtraShuffleRules = if (isFinalStage(newPlan)) { + finalStagePreparationWithExtraShuffleRules + } else { + queryStagePreparationWithExtraShuffleRules + } + val newPlanWithExtraShuffle = applyPhysicalRules( + newPlan, + preparationWithExtraShuffleRules, Some((planChangeLogger, "AQE Replanning"))) // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will @@ -661,8 +670,9 @@ case class AdaptiveSparkPlanExec( // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule. 
val finalPlan = currentPhysicalPlan match { case b: BroadcastExchangeLike - if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan)) - case _ => newPlan + if (!newPlanWithExtraShuffle.isInstanceOf[BroadcastExchangeLike]) => + b.withNewChildren(Seq(newPlanWithExtraShuffle)) + case _ => newPlanWithExtraShuffle } (finalPlan, optimized) From 59a5e4a9ac3f322df504465caba09ec851c99d69 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 10 Jun 2021 13:52:52 +0800 Subject: [PATCH 04/37] fix local reader number --- .../spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 7b2eab28c51cc..d3726ed31f76e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1916,7 +1916,8 @@ class AdaptiveQueryExecSuite "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2") val smj2 = findTopLevelSortMergeJoin(adaptive2) assert(smj2.size == 1 && smj2.forall(_.isSkewJoin)) - checkNumLocalShuffleReaders(adaptive2, 3) + // top level shuffle reader is local + checkNumLocalShuffleReaders(adaptive2, 2) } } } From 9c985da9870e107311cef0383a2a40703e4f4f07 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 2 Jul 2021 12:13:59 +0800 Subject: [PATCH 05/37] more cost --- .../adaptive/AdaptiveSparkPlanExec.scala | 50 +++++++++++-------- .../adaptive/SkewJoinAwareCost.scala | 20 +++++--- .../adaptive/AdaptiveQueryExecSuite.scala | 2 +- 3 files changed, 43 insertions(+), 29 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 017ee417f75bc..63feccb0fe40f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -153,11 +153,7 @@ case class AdaptiveSparkPlanExec( } @transient private val costEvaluator = - if (conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN)) { - SkewJoinAwareCostEvaluator - } else { - SimpleCostEvaluator - } + SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN)) @transient val initialPlan = context.session.withActive { applyPhysicalRules( @@ -279,17 +275,26 @@ case class AdaptiveSparkPlanExec( // plans are updated, we can clear the query stage list because at this point the two plans // are semantically and physically in sync again. val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) - val (newPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) + val (reOptimizePhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) + val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizePhysicalPlan) val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) - val newCost = costEvaluator.evaluateCost(newPhysicalPlan) - if (newCost < origCost || - (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) { + val newCost = costEvaluator.evaluateCost(reOptimizePhysicalPlan) + val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle) + def updateCurrentPlan(newPhysicalPlan: SparkPlan): Unit = { logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan stagesToReplace = Seq.empty[QueryStageExec] } + + if (extraShuffleCost < newCost || + (extraShuffleCost == newCost && planWithExtraShuffle != reOptimizePhysicalPlan)) { + 
updateCurrentPlan(planWithExtraShuffle) + } else if (newCost < origCost || + (newCost == origCost && currentPhysicalPlan != reOptimizePhysicalPlan)) { + updateCurrentPlan(reOptimizePhysicalPlan) + } // Now that some stages have finished, we can try creating new stages. result = createQueryStages(currentPhysicalPlan) } @@ -651,16 +656,6 @@ case class AdaptiveSparkPlanExec( preprocessingRules ++ queryStagePreparationRules, Some((planChangeLogger, "AQE Replanning"))) - val preparationWithExtraShuffleRules = if (isFinalStage(newPlan)) { - finalStagePreparationWithExtraShuffleRules - } else { - queryStagePreparationWithExtraShuffleRules - } - val newPlanWithExtraShuffle = applyPhysicalRules( - newPlan, - preparationWithExtraShuffleRules, - Some((planChangeLogger, "AQE Replanning"))) - // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will // add the `BroadcastExchangeExec` node manually in the DPP subquery, // not through `EnsureRequirements` rule. Therefore, when the DPP subquery is complicated @@ -670,14 +665,25 @@ case class AdaptiveSparkPlanExec( // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule. val finalPlan = currentPhysicalPlan match { case b: BroadcastExchangeLike - if (!newPlanWithExtraShuffle.isInstanceOf[BroadcastExchangeLike]) => - b.withNewChildren(Seq(newPlanWithExtraShuffle)) - case _ => newPlanWithExtraShuffle + if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan)) + case _ => newPlan } (finalPlan, optimized) } + private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = { + val preparationWithExtraShuffleRules = if (isFinalStage(sparkPlan)) { + finalStagePreparationWithExtraShuffleRules + } else { + queryStagePreparationWithExtraShuffleRules + } + applyPhysicalRules( + sparkPlan, + preparationWithExtraShuffleRules, + Some((planChangeLogger, "AQE Replanning"))) + } + /** * Recursively set `TEMP_LOGICAL_PLAN_TAG` for the current `plan` node. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala index a2e101216a548..c2e9642171588 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala @@ -25,7 +25,9 @@ import org.apache.spark.sql.execution.joins.ShuffledJoin /** * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number */ -case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost { +case class SkewJoinAwareCost( + numShuffles: Int, + numSkewJoins: Int) extends Cost { override def compare(that: Cost): Int = that match { case other: SkewJoinAwareCost => if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) { @@ -36,6 +38,7 @@ case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost { } else { 0 } + case _ => throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString) } @@ -45,14 +48,19 @@ case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost { * A skew join aware implementation of [[CostEvaluator]], which counts the number of * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan. 
*/ -object SkewJoinAwareCostEvaluator extends CostEvaluator { +case class SkewJoinAwareCostEvaluator(forceOptimizeSkewJoin: Boolean) extends CostEvaluator { override def evaluateCost(plan: SparkPlan): Cost = { val shuffleNumber = plan.collect { case s: ShuffleExchangeLike => s }.size - val skewJoinNumber = plan.collect { - case j: ShuffledJoin if j.isSkewJoin => j - }.size - SkewJoinAwareCost(shuffleNumber, skewJoinNumber) + + if (forceOptimizeSkewJoin) { + val skewJoinNumber = plan.collect { + case j: ShuffledJoin if j.isSkewJoin => j + }.size + SkewJoinAwareCost(shuffleNumber, skewJoinNumber) + } else { + SimpleCost(shuffleNumber) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index d3726ed31f76e..6977007943d4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1786,7 +1786,6 @@ class AdaptiveQueryExecSuite } } -<<<<<<< HEAD test("SPARK-35650: Coalesce number of partitions by AEQ") { withSQLConf(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1") { Seq("REPARTITION", "REBALANCE(key)") @@ -1890,6 +1889,7 @@ class AdaptiveQueryExecSuite test("SPARK-33832: Support optimize skew join even if introduce extra shuffle") { withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_OPTIMIZE_SKEWS_IN_REBALANCE_PARTITIONS_ENABLED.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100", SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100", From 8bc22ad61f2a76da6a3430f5eebe9646cf5fb841 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 9 Jul 2021 17:06:17 +0800 Subject: [PATCH 06/37] nit --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 11 ++++++----- 
.../execution/adaptive/AdaptiveSparkPlanExec.scala | 3 ++- .../execution/adaptive/AdaptiveQueryExecSuite.scala | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 35f7f116a51e5..6e3f4909d8b6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -678,11 +678,12 @@ object SQLConf { .booleanConf .createWithDefault(true) - val ADAPTIVE_FORCE_ENABLE_SKEW_JOIN = buildConf("spark.sql.adaptive.forceEnableSkewJoin") - .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.") - .version("3.2.0") - .booleanConf - .createWithDefault(false) + val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN = + buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin") + .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.") + .version("3.2.0") + .booleanConf + .createWithDefault(false) val ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS = buildConf("spark.sql.adaptive.customCostEvaluatorClass") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index c21e35ca4674c..b975456b8465e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -155,7 +155,8 @@ case class AdaptiveSparkPlanExec( @transient private val costEvaluator = conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) - case _ => SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN)) + case _ => + 
SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) } @transient val initialPlan = context.session.withActive { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index e755c848100d7..5d4cd268816e9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1894,7 +1894,7 @@ class AdaptiveQueryExecSuite SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100", SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1", SQLConf.SHUFFLE_PARTITIONS.key -> "10", - SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN.key -> "true") { + SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN.key -> "true") { withTempView("skewData1", "skewData2") { spark .range(0, 1000, 1, 10) From 7734d3eec1078a56f600ce7b6e718824c09a0e3b Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 9 Jul 2021 17:38:23 +0800 Subject: [PATCH 07/37] nit --- .../adaptive/AdaptiveSparkPlanExec.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index b975456b8465e..4324eefbe8b64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -281,25 +281,25 @@ case class AdaptiveSparkPlanExec( // plans are updated, we can clear the query stage list because at this point the two plans // are semantically and physically in sync again. 
val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) - val (reOptimizePhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) - val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizePhysicalPlan) + val (reOptimizationPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) + val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizationPhysicalPlan) val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) - val newCost = costEvaluator.evaluateCost(reOptimizePhysicalPlan) + val reOptimizationCost = costEvaluator.evaluateCost(reOptimizationPhysicalPlan) val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle) def updateCurrentPlan(newPhysicalPlan: SparkPlan): Unit = { - logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan") + logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan stagesToReplace = Seq.empty[QueryStageExec] } - if (extraShuffleCost < newCost || - (extraShuffleCost == newCost && planWithExtraShuffle != reOptimizePhysicalPlan)) { + if (extraShuffleCost < reOptimizationCost || (extraShuffleCost == reOptimizationCost && + reOptimizationPhysicalPlan != planWithExtraShuffle)) { updateCurrentPlan(planWithExtraShuffle) - } else if (newCost < origCost || - (newCost == origCost && currentPhysicalPlan != reOptimizePhysicalPlan)) { - updateCurrentPlan(reOptimizePhysicalPlan) + } else if (reOptimizationCost < origCost || + (reOptimizationCost == origCost && currentPhysicalPlan != reOptimizationPhysicalPlan)) { + updateCurrentPlan(reOptimizationPhysicalPlan) } // Now that some stages have finished, we can try creating new stages. 
result = createQueryStages(currentPhysicalPlan) From 3dc61a3f6ebb784aac7e739bed34f0028f77c052 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 3 Aug 2021 19:38:46 +0800 Subject: [PATCH 08/37] force optimize skewed join --- .../adaptive/AdaptiveSparkPlanExec.scala | 62 ++----------------- .../adaptive/SkewJoinAwareCost.scala | 4 +- 2 files changed, 6 insertions(+), 60 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index da003b24a9a89..c719e6c4a6524 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -116,9 +116,9 @@ case class AdaptiveSparkPlanExec( // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of // skew join and extra shuffle nodes. OptimizeSkewedJoin, - // Add the EnsureRequirements rule here since OptimizeSkewedJoin may change the - // output partitioning - EnsureRequirements() + // Add the EnsureRequirements rule here and don't optimize out repartition so that we can + // ensure the output partitioning of OptimizeSkewedJoin is always expected. + EnsureRequirements(false) ) // A list of physical optimizer rules to be applied to a new stage before its execution. These @@ -140,36 +140,6 @@ case class AdaptiveSparkPlanExec( CollapseCodegenStages() ) - // OptimizeSkewedJoin has moved into this rules, so we should follow the finalStageOptimizerRules - // for the final stage. 
- private def finalStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = { - val origins = inputPlan.collect { - case s: ShuffleExchangeLike => s.shuffleOrigin - } - queryStagePreparationWithExtraShuffleRules.filter { - case c: AQEShuffleReadRule => - origins.forall(c.supportedShuffleOrigins.contains) - case _ => true - } - } - - // The partitioning of the query output depends on the shuffle(s) in the final stage. If the - // original plan contains a repartition operator, we need to preserve the specified partitioning, - // whether or not the repartition-introduced shuffle is optimized out because of an underlying - // shuffle of the same partitioning. Thus, we need to exclude some `CustomShuffleReaderRule`s - // from the final stage, depending on the presence and properties of repartition operators. - private def finalStageOptimizerRules: Seq[Rule[SparkPlan]] = { - val origins = inputPlan.collect { - case s: ShuffleExchangeLike => s.shuffleOrigin - } - val allRules = queryStageOptimizerRules ++ postStageCreationRules - allRules.filter { - case c: CustomShuffleReaderRule => - origins.forall(c.supportedShuffleOrigins.contains) - case _ => true - } - } ++ context.session.sessionState.postStageCreationRules - private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) => val applied = rule.apply(latestPlan) @@ -698,25 +668,6 @@ case class AdaptiveSparkPlanExec( logicalPlan } - private def isFinalStage(sparkPlan: SparkPlan): Boolean = { - sparkPlan match { - // avoid top level node is Exchange - case _: Exchange => false - case plan => - // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage - // and all query stages are materialized. 
- plan.find { - case p if p.children.exists( - child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) => - p match { - case stage: QueryStageExec if stage.isMaterialized => false - case _ => true - } - case _ => false - }.isEmpty - } - } - /** * Re-optimize and run physical planning on the current logical plan based on the latest stats. */ @@ -746,14 +697,9 @@ case class AdaptiveSparkPlanExec( } private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = { - val preparationWithExtraShuffleRules = if (isFinalStage(sparkPlan)) { - finalStagePreparationWithExtraShuffleRules - } else { - queryStagePreparationWithExtraShuffleRules - } applyPhysicalRules( sparkPlan, - preparationWithExtraShuffleRules, + queryStagePreparationWithExtraShuffleRules, Some((planChangeLogger, "AQE Replanning"))) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala index c2e9642171588..892b8a9749677 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala @@ -48,13 +48,13 @@ case class SkewJoinAwareCost( * A skew join aware implementation of [[CostEvaluator]], which counts the number of * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan. 
*/ -case class SkewJoinAwareCostEvaluator(forceOptimizeSkewJoin: Boolean) extends CostEvaluator { +case class SkewJoinAwareCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator { override def evaluateCost(plan: SparkPlan): Cost = { val shuffleNumber = plan.collect { case s: ShuffleExchangeLike => s }.size - if (forceOptimizeSkewJoin) { + if (forceOptimizeSkewedJoin) { val skewJoinNumber = plan.collect { case j: ShuffledJoin if j.isSkewJoin => j }.size From 30b7de03dc20ab8f2d9aaa07ed37a18d6081f672 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 3 Aug 2021 19:41:53 +0800 Subject: [PATCH 09/37] style --- .../apache/spark/sql/internal/SQLConf.scala | 2 +- .../adaptive/AdaptiveSparkPlanExec.scala | 2 +- .../adaptive/OptimizeSkewedJoin.scala | 20 +++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 71d5928c24a74..70097ab1518d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -680,7 +680,7 @@ object SQLConf { val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN = buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin") - .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.") + .doc("When true, force enable OptimizeSkewedJoin even if it introduces extra shuffle.") .version("3.2.0") .booleanConf .createWithDefault(false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index c719e6c4a6524..cc63000c88ce4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -138,7 +138,7 @@ case class AdaptiveSparkPlanExec( @transient private val postStageCreationRules = Seq( ApplyColumnarRulesAndInsertTransitions(context.session.sessionState.columnarRules), CollapseCodegenStages() - ) + ) ++ context.session.sessionState.postStageCreationRules private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 2c423c8a4189a..dcdc67360f98c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -114,9 +114,9 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule { * 3 tasks separately. 
*/ private def tryOptimizeJoinChildren( - left: ShuffleQueryStageExec, - right: ShuffleQueryStageExec, - joinType: JoinType): Option[(SparkPlan, SparkPlan)] = { + left: ShuffleQueryStageExec, + right: ShuffleQueryStageExec, + joinType: JoinType): Option[(SparkPlan, SparkPlan)] = { val canSplitLeft = canSplitLeftSide(joinType) val canSplitRight = canSplitRightSide(joinType) if (!canSplitLeft && !canSplitRight) return None @@ -202,9 +202,9 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule { } def optimizeSkewJoin(plan: SparkPlan): SparkPlan = plan.transformUp { - case smj@SortMergeJoinExec(_, _, joinType, _, - s1@SortExec(_, _, ShuffleStage(left: ShuffleQueryStageExec), _), - s2@SortExec(_, _, ShuffleStage(right: ShuffleQueryStageExec), _), false) => + case smj @ SortMergeJoinExec(_, _, joinType, _, + s1 @ SortExec(_, _, ShuffleStage(left: ShuffleQueryStageExec), _), + s2 @ SortExec(_, _, ShuffleStage(right: ShuffleQueryStageExec), _), false) => val newChildren = tryOptimizeJoinChildren(left, right, joinType) if (newChildren.isDefined) { val (newLeft, newRight) = newChildren.get @@ -214,9 +214,9 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule { smj } - case shj@ShuffledHashJoinExec(_, _, joinType, _, _, - ShuffleStage(left: ShuffleQueryStageExec), - ShuffleStage(right: ShuffleQueryStageExec), false) => + case shj @ ShuffledHashJoinExec(_, _, joinType, _, _, + ShuffleStage(left: ShuffleQueryStageExec), + ShuffleStage(right: ShuffleQueryStageExec), false) => val newChildren = tryOptimizeJoinChildren(left, right, joinType) if (newChildren.isDefined) { val (newLeft, newRight) = newChildren.get @@ -256,7 +256,7 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule { } } - private object ShuffleStage { + object ShuffleStage { def unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match { case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined && isSupported(s.shuffle) => From 6caa4a301d3836d34bfc99f02fcb7decb2ce82aa Mon Sep 17 
00:00:00 2001 From: ulysses-you Date: Tue, 3 Aug 2021 20:13:45 +0800 Subject: [PATCH 10/37] name --- .../spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index a414e6a44129f..fe4a2f3e82408 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1925,14 +1925,14 @@ class AdaptiveQueryExecSuite "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1") val smj = findTopLevelSortMergeJoin(adaptive) assert(smj.size == 1 && smj.forall(_.isSkewJoin)) - checkNumLocalShuffleReaders(adaptive, 3) + checkNumLocalShuffleReads(adaptive, 3) val (_, adaptive2) = runAdaptiveAndVerifyResult( "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2") val smj2 = findTopLevelSortMergeJoin(adaptive2) assert(smj2.size == 1 && smj2.forall(_.isSkewJoin)) // top level shuffle reader is local - checkNumLocalShuffleReaders(adaptive2, 2) + checkNumLocalShuffleReads(adaptive2, 2) } } } From cd1a37992e3fd80d3ec6ee808a41dd2fa6233350 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 13 Aug 2021 11:55:18 +0800 Subject: [PATCH 11/37] final stage --- .../adaptive/AdaptiveSparkPlanExec.scala | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index cc63000c88ce4..b2d87f1d5c5de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} +import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SQLPlanMetric, SparkListenerSQLAdaptiveSQLMetricUpdates} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.ThreadUtils @@ -118,7 +118,7 @@ case class AdaptiveSparkPlanExec( OptimizeSkewedJoin, // Add the EnsureRequirements rule here and don't optimize out repartition so that we can // ensure the output partitioning of OptimizeSkewedJoin is always expected. - EnsureRequirements(false) + EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined) ) // A list of physical optimizer rules to be applied to a new stage before its execution. 
These @@ -140,8 +140,9 @@ case class AdaptiveSparkPlanExec( CollapseCodegenStages() ) ++ context.session.sessionState.postStageCreationRules - private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { - val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) => + private def optimizeStage( + rules: Seq[Rule[SparkPlan]], plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { + val optimized = rules.foldLeft(plan) { case (latestPlan, rule) => val applied = rule.apply(latestPlan) val result = rule match { case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) => @@ -169,6 +170,10 @@ case class AdaptiveSparkPlanExec( optimized } + private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { + optimizeStage(queryStageOptimizerRules, plan, isFinalStage) + } + @transient private val costEvaluator = conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) @@ -696,11 +701,30 @@ case class AdaptiveSparkPlanExec( (finalPlan, optimized) } + private def isFinalStage(sparkPlan: SparkPlan): Boolean = { + sparkPlan match { + // avoid top level node is Exchange + case _: Exchange => false + case plan => + // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage + // and all query stages are materialized. 
+ plan.find { + case p if p.children.exists( + child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) => + p match { + case stage: QueryStageExec if stage.isMaterialized => false + case _ => true + } + case _ => false + }.isEmpty + } + } + private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = { - applyPhysicalRules( - sparkPlan, + optimizeStage( queryStagePreparationWithExtraShuffleRules, - Some((planChangeLogger, "AQE Replanning"))) + sparkPlan, + isFinalStage(sparkPlan)) } /** From 2b3bfe6d04456b0eb7df083964067dec5539ac38 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 13 Aug 2021 11:56:12 +0800 Subject: [PATCH 12/37] style --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index b2d87f1d5c5de..d888a20973ebc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SQLPlanMetric, SparkListenerSQLAdaptiveSQLMetricUpdates} +import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.ThreadUtils From 6725f975ca9f15eb8f233381fcfa9fc88cdef760 Mon Sep 17 00:00:00 2001 From: 
ulysses-you Date: Fri, 13 Aug 2021 14:35:01 +0800 Subject: [PATCH 13/37] checkDistribution --- .../adaptive/AdaptiveSparkPlanExec.scala | 59 +++++++++++-------- .../adaptive/AdaptiveQueryExecSuite.scala | 5 ++ 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 6b41529c8a394..3ccaa8749cabd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -113,7 +113,7 @@ case class AdaptiveSparkPlanExec( // This list rules are applied between queryStagePreparationRules and estimate physical plan cost // so that we can support introduce extra shuffle - @transient private val queryStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq( + @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq( // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of // skew join and extra shuffle nodes. 
OptimizeSkewedJoin, @@ -147,27 +147,14 @@ case class AdaptiveSparkPlanExec( collapseCodegenStagesRule ) - private def optimizeStage( - rules: Seq[Rule[SparkPlan]], plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { - val optimized = rules.foldLeft(plan) { case (latestPlan, rule) => + private def optimizeQueryStage( + plan: SparkPlan, + isFinalStage: Boolean): SparkPlan = context.qe.withCteMap { + val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) => val applied = rule.apply(latestPlan) val result = rule match { case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) => - val distribution = if (isFinalStage) { - // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize - // out the user-specified repartition, thus we don't have a distribution requirement - // for the final plan. - requiredDistribution.getOrElse(UnspecifiedDistribution) - } else { - UnspecifiedDistribution - } - if (ValidateRequirements.validate(applied, distribution)) { - applied - } else { - logDebug(s"Rule ${rule.ruleName} is not applied as it breaks the " + - "distribution requirement of the query plan.") - latestPlan - } + checkDistribution(applied, latestPlan, isFinalStage, rule.ruleName) case _ => applied } planChangeLogger.logRule(rule.ruleName, latestPlan, result) @@ -177,8 +164,26 @@ case class AdaptiveSparkPlanExec( optimized } - private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = { - optimizeStage(queryStageOptimizerRules, plan, isFinalStage) + private def checkDistribution( + newPlan: SparkPlan, + originPlan: SparkPlan, + isFinalStage: Boolean, + ruleName: String): SparkPlan = { + val distribution = if (isFinalStage) { + // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize + // out the user-specified repartition, thus we don't have a distribution requirement + // for the final plan. 
+ requiredDistribution.getOrElse(UnspecifiedDistribution) + } else { + UnspecifiedDistribution + } + if (ValidateRequirements.validate(newPlan, distribution)) { + newPlan + } else { + logDebug(s"Rule $ruleName is not applied as it breaks the " + + "distribution requirement of the query plan.") + originPlan + } } @transient private val costEvaluator = @@ -311,7 +316,7 @@ case class AdaptiveSparkPlanExec( // are semantically and physically in sync again. val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) val (reOptimizationPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) - val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizationPhysicalPlan) + val planWithExtraShuffle = optimizeSkewedJoin(reOptimizationPhysicalPlan) val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) val reOptimizationCost = costEvaluator.evaluateCost(reOptimizationPhysicalPlan) val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle) @@ -706,11 +711,13 @@ case class AdaptiveSparkPlanExec( } } - private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = { - optimizeStage( - queryStagePreparationWithExtraShuffleRules, + private def optimizeSkewedJoin(sparkPlan: SparkPlan): SparkPlan = { + val optimized = applyPhysicalRules( sparkPlan, - isFinalStage(sparkPlan)) + optimizeSkewedJoinWithExtraShuffleRules, + Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle")) + ) + checkDistribution(optimized, sparkPlan, isFinalStage(optimized), OptimizeSkewedJoin.ruleName) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 8e838542ddd74..438529aad0e6f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1940,6 +1940,11 @@ class AdaptiveQueryExecSuite assert(smj2.size == 1 && smj2.forall(_.isSkewJoin)) // top level shuffle reader is local checkNumLocalShuffleReads(adaptive2, 2) + + val (_, adaptive3) = runAdaptiveAndVerifyResult( + "SELECT /*+ repartition(key1) */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2") + val smj3 = findTopLevelSortMergeJoin(adaptive3) + assert(smj3.size == 1 && !smj3.exists(_.isSkewJoin)) } } } From 7a0448b0a80b52bf226f219bea515a5d0f7e96c4 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 19 Aug 2021 19:17:00 +0800 Subject: [PATCH 14/37] SimpleCostEvaluator --- .../apache/spark/sql/internal/SQLConf.scala | 2 +- .../adaptive/AdaptiveSparkPlanExec.scala | 2 +- .../adaptive/SkewJoinAwareCost.scala | 66 ------------------- .../execution/adaptive/simpleCosting.scala | 40 +++++++++-- 4 files changed, 37 insertions(+), 73 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9c84add520249..0020faed79ceb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -680,7 +680,7 @@ object SQLConf { val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN = buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin") .doc("When true, force enable OptimizeSkewedJoin even if it introduces extra shuffle.") - .version("3.2.0") + .version("3.3.0") .booleanConf .createWithDefault(false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 3ccaa8749cabd..ffac595d685ce 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -190,7 +190,7 @@ case class AdaptiveSparkPlanExec( conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) case _ => - SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) + SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) } @transient val initialPlan = context.session.withActive { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala deleted file mode 100644 index 892b8a9749677..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.adaptive - -import org.apache.spark.sql.errors.QueryExecutionErrors -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike -import org.apache.spark.sql.execution.joins.ShuffledJoin - -/** - * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number - */ -case class SkewJoinAwareCost( - numShuffles: Int, - numSkewJoins: Int) extends Cost { - override def compare(that: Cost): Int = that match { - case other: SkewJoinAwareCost => - if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) { - // If more skew joins are optimized or less shuffle nodes, it means the cost is lower - -1 - } else if (numShuffles > other.numShuffles) { - 1 - } else { - 0 - } - - case _ => - throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString) - } -} - -/** - * A skew join aware implementation of [[CostEvaluator]], which counts the number of - * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan. 
- */ -case class SkewJoinAwareCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator { - override def evaluateCost(plan: SparkPlan): Cost = { - val shuffleNumber = plan.collect { - case s: ShuffleExchangeLike => s - }.size - - if (forceOptimizeSkewedJoin) { - val skewJoinNumber = plan.collect { - case j: ShuffledJoin if j.isSkewJoin => j - }.size - SkewJoinAwareCost(shuffleNumber, skewJoinNumber) - } else { - SimpleCost(shuffleNumber) - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala index 7f026835dcf48..8b26d84f43ea8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike +import org.apache.spark.sql.execution.joins.ShuffledJoin /** * A simple implementation of [[Cost]], which takes a number of [[Long]] as the cost value. @@ -35,15 +36,44 @@ case class SimpleCost(value: Long) extends Cost { } /** - * A simple implementation of [[CostEvaluator]], which counts the number of - * [[ShuffleExchangeLike]] nodes in the plan. 
+ * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number */ -object SimpleCostEvaluator extends CostEvaluator { +case class SkewJoinAwareCost( + numShuffles: Int, + numSkewJoins: Int) extends Cost { + override def compare(that: Cost): Int = that match { + case other: SkewJoinAwareCost => + if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) { + // If more skew joins are optimized or less shuffle nodes, it means the cost is lower + -1 + } else if (numShuffles > other.numShuffles) { + 1 + } else { + 0 + } + case _ => + throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString) + } +} + +/** + * A skew join aware implementation of [[CostEvaluator]], which counts the number of + * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan. + */ +case class SimpleCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator { override def evaluateCost(plan: SparkPlan): Cost = { - val cost = plan.collect { + val shuffleNumber = plan.collect { case s: ShuffleExchangeLike => s }.size - SimpleCost(cost) + + if (forceOptimizeSkewedJoin) { + val skewJoinNumber = plan.collect { + case j: ShuffledJoin if j.isSkewJoin => j + }.size + SkewJoinAwareCost(shuffleNumber, skewJoinNumber) + } else { + SimpleCost(shuffleNumber) + } } } From 60b7b9da8f268f15706c30462976ffc389c95c45 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 19 Aug 2021 20:44:11 +0800 Subject: [PATCH 15/37] address comment --- .../adaptive/AdaptiveSparkPlanExec.scala | 97 +++++++++++-------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index ffac595d685ce..d77e90074b4f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -80,6 +80,9 @@ case class AdaptiveSparkPlanExec( case _ => logDebug(_) } + @transient private val forceOptimizeSkewedJoin = + conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN) + @transient private val planChangeLogger = new PlanChangeLogger[SparkPlan]() // The logical plan optimizer for re-optimizing the current logical plan. @@ -113,14 +116,16 @@ case class AdaptiveSparkPlanExec( // This list rules are applied between queryStagePreparationRules and estimate physical plan cost // so that we can support introduce extra shuffle - @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq( - // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of - // skew join and extra shuffle nodes. - OptimizeSkewedJoin, - // Add the EnsureRequirements rule here and don't optimize out repartition so that we can - // ensure the output partitioning of OptimizeSkewedJoin is always expected. - EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined) - ) + @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = { + val ensureExtraShuffleRule = if (forceOptimizeSkewedJoin) { + // Add the EnsureRequirements rule here and don't optimize out repartition so that we can + // ensure the output partitioning of OptimizeSkewedJoin is always expected. + Seq(EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)) + } else { + Nil + } + Seq(OptimizeSkewedJoin) ++ ensureExtraShuffleRule + } // A list of physical optimizer rules to be applied to a new stage before its execution. These // optimizations should be stage-independent. 
@@ -153,7 +158,7 @@ case class AdaptiveSparkPlanExec( val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) => val applied = rule.apply(latestPlan) val result = rule match { - case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) => + case _: AQEShuffleReadRule => checkDistribution(applied, latestPlan, isFinalStage, rule.ruleName) case _ => applied } @@ -169,6 +174,7 @@ case class AdaptiveSparkPlanExec( originPlan: SparkPlan, isFinalStage: Boolean, ruleName: String): SparkPlan = { + if (newPlan.fastEquals(originPlan)) return originPlan val distribution = if (isFinalStage) { // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize // out the user-specified repartition, thus we don't have a distribution requirement @@ -189,8 +195,7 @@ case class AdaptiveSparkPlanExec( @transient private val costEvaluator = conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) - case _ => - SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) + case _ => SimpleCostEvaluator(forceOptimizeSkewedJoin) } @transient val initialPlan = context.session.withActive { @@ -315,26 +320,25 @@ case class AdaptiveSparkPlanExec( // plans are updated, we can clear the query stage list because at this point the two plans // are semantically and physically in sync again. 
val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) - val (reOptimizationPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) - val planWithExtraShuffle = optimizeSkewedJoin(reOptimizationPhysicalPlan) - val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) - val reOptimizationCost = costEvaluator.evaluateCost(reOptimizationPhysicalPlan) - val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle) - def updateCurrentPlan(newPhysicalPlan: SparkPlan): Unit = { + val (newPhysicalPlans, newLogicalPlan) = reOptimize(logicalPlan) + val newPhysicalPlan = + (Seq(currentPhysicalPlan) ++ newPhysicalPlans) + .map(plan => (plan, costEvaluator.evaluateCost(plan))) + .reduce { (last, current) => + if (current._2 < last._2 || (current._2 == last._2 && current._1 != last._1)) { + current + } else { + last + } + }._1 + + if (!newPhysicalPlan.fastEquals(currentPhysicalPlan)) { logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan stagesToReplace = Seq.empty[QueryStageExec] } - - if (extraShuffleCost < reOptimizationCost || (extraShuffleCost == reOptimizationCost && - reOptimizationPhysicalPlan != planWithExtraShuffle)) { - updateCurrentPlan(planWithExtraShuffle) - } else if (reOptimizationCost < origCost || - (reOptimizationCost == origCost && currentPhysicalPlan != reOptimizationPhysicalPlan)) { - updateCurrentPlan(reOptimizationPhysicalPlan) - } // Now that some stages have finished, we can try creating new stages. result = createQueryStages(currentPhysicalPlan) } @@ -667,11 +671,11 @@ case class AdaptiveSparkPlanExec( * Re-optimize and run physical planning on the current logical plan based on the latest stats. 
*/ private def reOptimize( - logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = context.qe.withCteMap { + logicalPlan: LogicalPlan): (Seq[SparkPlan], LogicalPlan) = context.qe.withCteMap { logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() - val newPlan = applyPhysicalRules( + val optimizedPhysicalPlan = applyPhysicalRules( sparkPlan, preprocessingRules ++ queryStagePreparationRules, Some((planChangeLogger, "AQE Replanning"))) @@ -683,13 +687,35 @@ case class AdaptiveSparkPlanExec( // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery. // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule. - val finalPlan = currentPhysicalPlan match { + def updateBroadcastExchange(plan: SparkPlan): SparkPlan = currentPhysicalPlan match { case b: BroadcastExchangeLike - if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan)) - case _ => newPlan + if (!plan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(plan)) + case _ => plan } - (finalPlan, optimized) + val optimizedWithSkewedJoin = applyPhysicalRules( + optimizedPhysicalPlan, + optimizeSkewedJoinWithExtraShuffleRules, + Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle")) + ) + val validatedWithSkewedJoin = + checkDistribution( + optimizedWithSkewedJoin, + optimizedPhysicalPlan, + isFinalStage(optimizedWithSkewedJoin), + OptimizeSkewedJoin.ruleName) + + // here are three reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan: + // 1. no skewed join optimized + // 2. optimize skewed join introduce extra shuffle and force optimize is disabled + // 3. 
optimize skewed join change final stage output partitioning + val newPhysicalPlans = if (validatedWithSkewedJoin.fastEquals(optimizedPhysicalPlan)) { + updateBroadcastExchange(optimizedPhysicalPlan) :: Nil + } else { + updateBroadcastExchange(optimizedPhysicalPlan) :: + updateBroadcastExchange(validatedWithSkewedJoin) :: Nil + } + (newPhysicalPlans, optimized) } private def isFinalStage(sparkPlan: SparkPlan): Boolean = { @@ -711,15 +737,6 @@ case class AdaptiveSparkPlanExec( } } - private def optimizeSkewedJoin(sparkPlan: SparkPlan): SparkPlan = { - val optimized = applyPhysicalRules( - sparkPlan, - optimizeSkewedJoinWithExtraShuffleRules, - Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle")) - ) - checkDistribution(optimized, sparkPlan, isFinalStage(optimized), OptimizeSkewedJoin.ruleName) - } - /** * Recursively set `TEMP_LOGICAL_PLAN_TAG` for the current `plan` node. */ From fbf9727f9f1add7447005e1c4bc41697cc512e1b Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 19 Aug 2021 21:52:58 +0800 Subject: [PATCH 16/37] cost --- .../spark/sql/execution/adaptive/simpleCosting.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala index 8b26d84f43ea8..621fb01832ccd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala @@ -43,8 +43,12 @@ case class SkewJoinAwareCost( numSkewJoins: Int) extends Cost { override def compare(that: Cost): Int = that match { case other: SkewJoinAwareCost => - if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) { - // If more skew joins are optimized or less shuffle nodes, it means the cost is lower + // If more skew joins are optimized or less shuffle nodes, it means the cost 
is lower + if (numSkewJoins > other.numSkewJoins) { + -1 + } else if (numSkewJoins < other.numSkewJoins) { + 1 + } else if (numShuffles < other.numShuffles) { -1 } else if (numShuffles > other.numShuffles) { 1 From b54e9c232a43ce05e320d5459b16b770261ee38b Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 20 Aug 2021 12:26:02 +0800 Subject: [PATCH 17/37] plan twice --- .../adaptive/AdaptiveSparkPlanExec.scala | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index d77e90074b4f7..2e6585bd5b818 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -103,28 +103,27 @@ case class AdaptiveSparkPlanExec( // A list of physical plan rules to be applied before creation of query stages. The physical // plan should reach a final status of query stages (i.e., no more addition or removal of // Exchange nodes) after running these rules. - @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( - RemoveRedundantProjects, - // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for - // the final plan, but we do need to respect the user-specified repartition. Here we ask - // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work - // around this case. 
- EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined), - RemoveRedundantSorts, - DisableUnnecessaryBucketedScan - ) ++ context.session.sessionState.queryStagePrepRules - - // This list rules are applied between queryStagePreparationRules and estimate physical plan cost - // so that we can support introduce extra shuffle - @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = { - val ensureExtraShuffleRule = if (forceOptimizeSkewedJoin) { - // Add the EnsureRequirements rule here and don't optimize out repartition so that we can - // ensure the output partitioning of OptimizeSkewedJoin is always expected. - Seq(EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)) + private def queryStagePreparationRules( + optimizeSkewedJoin: Boolean = false): Seq[Rule[SparkPlan]] = { + val optimizeSkewedJoinRules = if (optimizeSkewedJoin) { + Seq(OptimizeSkewedJoin, + // Add the EnsureRequirements rule here and don't optimize out repartition so that we can + // ensure the output partitioning of OptimizeSkewedJoin is always expected. + EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)) } else { Nil } - Seq(OptimizeSkewedJoin) ++ ensureExtraShuffleRule + + Seq( + RemoveRedundantProjects, + // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for + // the final plan, but we do need to respect the user-specified repartition. Here we ask + // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work + // around this case. + EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined), + RemoveRedundantSorts, + DisableUnnecessaryBucketedScan + ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules } // A list of physical optimizer rules to be applied to a new stage before its execution. 
These @@ -200,7 +199,7 @@ case class AdaptiveSparkPlanExec( @transient val initialPlan = context.session.withActive { applyPhysicalRules( - inputPlan, queryStagePreparationRules, Some((planChangeLogger, "AQE Preparations"))) + inputPlan, queryStagePreparationRules(), Some((planChangeLogger, "AQE Preparations"))) } @volatile private var currentPhysicalPlan = initialPlan @@ -332,7 +331,7 @@ case class AdaptiveSparkPlanExec( } }._1 - if (!newPhysicalPlan.fastEquals(currentPhysicalPlan)) { + if (newPhysicalPlan.ne(currentPhysicalPlan)) { logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan @@ -667,6 +666,25 @@ case class AdaptiveSparkPlanExec( logicalPlan } + private def isFinalStage(sparkPlan: SparkPlan): Boolean = { + sparkPlan match { + // avoid top level node is Exchange + case _: Exchange => false + case plan => + // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage + // and all query stages are materialized. + plan.find { + case p if p.children.exists( + child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) => + p match { + case stage: QueryStageExec if stage.isMaterialized => false + case _ => true + } + case _ => false + }.isEmpty + } + } + /** * Re-optimize and run physical planning on the current logical plan based on the latest stats. 
*/ @@ -675,11 +693,25 @@ case class AdaptiveSparkPlanExec( logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() + val optimizedPhysicalPlan = applyPhysicalRules( sparkPlan, - preprocessingRules ++ queryStagePreparationRules, + preprocessingRules ++ queryStagePreparationRules(), Some((planChangeLogger, "AQE Replanning"))) + val optimizedWithSkewedJoin = applyPhysicalRules( + sparkPlan, + preprocessingRules ++ queryStagePreparationRules(true), + Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))) + + // respect the requiredDistribution for final stage + val validatedWithSkewedJoin = + checkDistribution( + optimizedWithSkewedJoin, + optimizedPhysicalPlan, + isFinalStage(optimizedWithSkewedJoin), + OptimizeSkewedJoin.ruleName) + // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will // add the `BroadcastExchangeExec` node manually in the DPP subquery, // not through `EnsureRequirements` rule. Therefore, when the DPP subquery is complicated @@ -693,23 +725,10 @@ case class AdaptiveSparkPlanExec( case _ => plan } - val optimizedWithSkewedJoin = applyPhysicalRules( - optimizedPhysicalPlan, - optimizeSkewedJoinWithExtraShuffleRules, - Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle")) - ) - val validatedWithSkewedJoin = - checkDistribution( - optimizedWithSkewedJoin, - optimizedPhysicalPlan, - isFinalStage(optimizedWithSkewedJoin), - OptimizeSkewedJoin.ruleName) - - // here are three reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan: + // here are two reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan: // 1. no skewed join optimized - // 2. optimize skewed join introduce extra shuffle and force optimize is disabled - // 3. 
optimize skewed join change final stage output partitioning - val newPhysicalPlans = if (validatedWithSkewedJoin.fastEquals(optimizedPhysicalPlan)) { + // 2. optimize skewed join doesn't satisfy requiredDistribution for final stage + val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(validatedWithSkewedJoin)) { updateBroadcastExchange(optimizedPhysicalPlan) :: Nil } else { updateBroadcastExchange(optimizedPhysicalPlan) :: @@ -718,25 +737,6 @@ case class AdaptiveSparkPlanExec( (newPhysicalPlans, optimized) } - private def isFinalStage(sparkPlan: SparkPlan): Boolean = { - sparkPlan match { - // avoid top level node is Exchange - case _: Exchange => false - case plan => - // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage - // and all query stages are materialized. - plan.find { - case p if p.children.exists( - child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) => - p match { - case stage: QueryStageExec if stage.isMaterialized => false - case _ => true - } - case _ => false - }.isEmpty - } - } - /** * Recursively set `TEMP_LOGICAL_PLAN_TAG` for the current `plan` node. 
*/ From f5ad40e7ef8fafc4e084ef15d7956d0b3eaf214e Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 20 Aug 2021 12:36:07 +0800 Subject: [PATCH 18/37] nit --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 2e6585bd5b818..1d51950fdff0a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -80,9 +80,6 @@ case class AdaptiveSparkPlanExec( case _ => logDebug(_) } - @transient private val forceOptimizeSkewedJoin = - conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN) - @transient private val planChangeLogger = new PlanChangeLogger[SparkPlan]() // The logical plan optimizer for re-optimizing the current logical plan. 
@@ -194,7 +191,7 @@ case class AdaptiveSparkPlanExec( @transient private val costEvaluator = conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) - case _ => SimpleCostEvaluator(forceOptimizeSkewedJoin) + case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) } @transient val initialPlan = context.session.withActive { From 8058fe9354ed28b32f0ea0bdc8c9243f79c58fba Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 20 Aug 2021 12:41:20 +0800 Subject: [PATCH 19/37] nit --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 1d51950fdff0a..72f88310de81f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -104,8 +104,8 @@ case class AdaptiveSparkPlanExec( optimizeSkewedJoin: Boolean = false): Seq[Rule[SparkPlan]] = { val optimizeSkewedJoinRules = if (optimizeSkewedJoin) { Seq(OptimizeSkewedJoin, - // Add the EnsureRequirements rule here and don't optimize out repartition so that we can - // ensure the output partitioning of OptimizeSkewedJoin is always expected. + // Add the EnsureRequirements rule here since OptimizeSkewedJoin will change + // output partitioning, make sure we have right distribution. 
EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)) } else { Nil From 369bf33c27e940cced13e72feb54fc40f945f20e Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 25 Aug 2021 11:13:31 +0800 Subject: [PATCH 20/37] ensureRequiredDistribution --- .../sql/execution/adaptive/AQEUtils.scala | 19 +++++- .../adaptive/AdaptiveSparkPlanExec.scala | 49 ++++++--------- .../adaptive/AdaptiveQueryExecSuite.scala | 62 ++++++++++++++----- 3 files changed, 81 insertions(+), 49 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala index 277af212d81f3..e2d19e8fed730 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala @@ -17,11 +17,12 @@ package org.apache.spark.sql.execution.adaptive +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution, HashPartitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan} import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec} -object AQEUtils { +object AQEUtils extends SQLConfHelper { // Analyze the given plan and calculate the required distribution of this plan w.r.t. the // user-specified repartition. @@ -57,4 +58,20 @@ object AQEUtils { } case _ => Some(UnspecifiedDistribution) } + + // Add an extra shuffle if input plan does not satisfy the required distribution. + // This method is invoked after optimizing skewed join in case we change final stage + // output partitioning. 
+ def ensureRequiredDistribution( + plan: SparkPlan, distribution: Option[Distribution]): SparkPlan = distribution match { + case Some(d) if !plan.outputPartitioning.satisfies(d) => + val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions) + val shuffleOrigin = if (d.requiredNumPartitions.isDefined) { + REPARTITION_BY_NUM + } else { + REPARTITION_BY_COL + } + ShuffleExchangeExec(d.createPartitioning(numPartitions), plan, shuffleOrigin) + case _ => plan + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 72f88310de81f..1831e73fd085d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -154,8 +154,22 @@ case class AdaptiveSparkPlanExec( val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) => val applied = rule.apply(latestPlan) val result = rule match { - case _: AQEShuffleReadRule => - checkDistribution(applied, latestPlan, isFinalStage, rule.ruleName) + case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) => + val distribution = if (isFinalStage) { + // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize + // out the user-specified repartition, thus we don't have a distribution requirement + // for the final plan. 
+ requiredDistribution.getOrElse(UnspecifiedDistribution) + } else { + UnspecifiedDistribution + } + if (ValidateRequirements.validate(applied, distribution)) { + applied + } else { + logDebug(s"Rule ${rule.ruleName} is not applied as it breaks the " + + "distribution requirement of the query plan.") + latestPlan + } case _ => applied } planChangeLogger.logRule(rule.ruleName, latestPlan, result) @@ -165,29 +179,6 @@ case class AdaptiveSparkPlanExec( optimized } - private def checkDistribution( - newPlan: SparkPlan, - originPlan: SparkPlan, - isFinalStage: Boolean, - ruleName: String): SparkPlan = { - if (newPlan.fastEquals(originPlan)) return originPlan - val distribution = if (isFinalStage) { - // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize - // out the user-specified repartition, thus we don't have a distribution requirement - // for the final plan. - requiredDistribution.getOrElse(UnspecifiedDistribution) - } else { - UnspecifiedDistribution - } - if (ValidateRequirements.validate(newPlan, distribution)) { - newPlan - } else { - logDebug(s"Rule $ruleName is not applied as it breaks the " + - "distribution requirement of the query plan.") - originPlan - } - } - @transient private val costEvaluator = conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) @@ -701,13 +692,9 @@ case class AdaptiveSparkPlanExec( preprocessingRules ++ queryStagePreparationRules(true), Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))) - // respect the requiredDistribution for final stage + // ensure the output partitioning for requiredDistribution val validatedWithSkewedJoin = - checkDistribution( - optimizedWithSkewedJoin, - optimizedPhysicalPlan, - isFinalStage(optimizedWithSkewedJoin), - OptimizeSkewedJoin.ruleName) + AQEUtils.ensureRequiredDistribution(optimizedWithSkewedJoin, requiredDistribution) // When both enabling AQE 
and DPP, `PlanAdaptiveDynamicPruningFilters` rule will // add the `BroadcastExchangeExec` node manually in the DPP subquery, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 438529aad0e6f..b50d271e41428 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -96,6 +96,12 @@ class AdaptiveQueryExecSuite (dfAdaptive.queryExecution.sparkPlan, adaptivePlan) } + private def findTopLevelShuffle(plan: SparkPlan): Seq[ShuffleExchangeExec] = { + collect(plan) { + case s: ShuffleExchangeExec => s + } + } + private def findTopLevelBroadcastHashJoin(plan: SparkPlan): Seq[BroadcastHashJoinExec] = { collect(plan) { case j: BroadcastHashJoinExec => j @@ -1928,23 +1934,45 @@ class AdaptiveQueryExecSuite .selectExpr("id % 1 as key2", "id as value2") .createOrReplaceTempView("skewData2") - val (_, adaptive) = runAdaptiveAndVerifyResult( - "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1") - val smj = findTopLevelSortMergeJoin(adaptive) - assert(smj.size == 1 && smj.forall(_.isSkewJoin)) - checkNumLocalShuffleReads(adaptive, 3) - - val (_, adaptive2) = runAdaptiveAndVerifyResult( - "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2") - val smj2 = findTopLevelSortMergeJoin(adaptive2) - assert(smj2.size == 1 && smj2.forall(_.isSkewJoin)) - // top level shuffle reader is local - checkNumLocalShuffleReads(adaptive2, 2) - - val (_, adaptive3) = runAdaptiveAndVerifyResult( - "SELECT /*+ repartition(key1) */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2") - val smj3 = findTopLevelSortMergeJoin(adaptive3) - assert(smj3.size == 1 && !smj3.exists(_.isSkewJoin)) + // check if optimized skewed join does not satisfy the required 
distribution + Seq(true, false).foreach { hasRequiredDistribution => + Seq(true, false).foreach { hasPartitionNumber => + val repartition = if (hasRequiredDistribution) { + s"/*+ repartition(${ if (hasPartitionNumber) "10," else ""}key1) */" + } else { + "" + } + + // check required distribution and extra shuffle + val (_, adaptive1) = + runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " + + s"JOIN skewData2 ON key1 = key2 GROUP BY key1") + val shuffles1 = findTopLevelShuffle(adaptive1) + assert(shuffles1.size == 3) + assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS) + val smj1 = findTopLevelSortMergeJoin(adaptive1) + assert(smj1.size == 1 && smj1.exists(_.isSkewJoin)) + + // only check required distribution + val (_, adaptive2) = + runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " + + s"JOIN skewData2 ON key1 = key2") + val shuffles2 = findTopLevelShuffle(adaptive2) + if (hasRequiredDistribution) { + assert(shuffles2.size == 3) + val finalShuffle = shuffles2.head + if (hasPartitionNumber) { + assert(finalShuffle.shuffleOrigin == REPARTITION_BY_NUM) + } else { + assert(finalShuffle.shuffleOrigin == REPARTITION_BY_COL) + } + } else { + assert(shuffles2.size == 2) + } + val smj2 = findTopLevelSortMergeJoin(adaptive2) + assert(smj2.size == 1 && smj2.exists(_.isSkewJoin)) + } + } } } } From d93c3df5605f358e76ba3cdd314a05067dd39f92 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 25 Aug 2021 11:16:26 +0800 Subject: [PATCH 21/37] remove dead code --- .../adaptive/AdaptiveSparkPlanExec.scala | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 1831e73fd085d..c34f1d21dab08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -654,25 +654,6 @@ case class AdaptiveSparkPlanExec( logicalPlan } - private def isFinalStage(sparkPlan: SparkPlan): Boolean = { - sparkPlan match { - // avoid top level node is Exchange - case _: Exchange => false - case plan => - // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage - // and all query stages are materialized. - plan.find { - case p if p.children.exists( - child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) => - p match { - case stage: QueryStageExec if stage.isMaterialized => false - case _ => true - } - case _ => false - }.isEmpty - } - } - /** * Re-optimize and run physical planning on the current logical plan based on the latest stats. */ From b215e2d0fdf959bd1c735297a1a33c1a078f2361 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 25 Aug 2021 11:20:26 +0800 Subject: [PATCH 22/37] simplify code --- .../adaptive/AdaptiveSparkPlanExec.scala | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index c34f1d21dab08..fc8c3f77fc813 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -668,14 +668,13 @@ case class AdaptiveSparkPlanExec( preprocessingRules ++ queryStagePreparationRules(), Some((planChangeLogger, "AQE Replanning"))) - val optimizedWithSkewedJoin = applyPhysicalRules( - sparkPlan, - preprocessingRules ++ queryStagePreparationRules(true), - Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))) - - // ensure the output partitioning for requiredDistribution - val validatedWithSkewedJoin = - 
AQEUtils.ensureRequiredDistribution(optimizedWithSkewedJoin, requiredDistribution) + val optimizedWithSkewedJoin = + AQEUtils.ensureRequiredDistribution( + applyPhysicalRules( + sparkPlan, + preprocessingRules ++ queryStagePreparationRules(true), + Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))), + requiredDistribution) // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will // add the `BroadcastExchangeExec` node manually in the DPP subquery, @@ -690,14 +689,11 @@ case class AdaptiveSparkPlanExec( case _ => plan } - // here are two reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan: - // 1. no skewed join optimized - // 2. optimize skewed join doesn't satisfy requiredDistribution for final stage - val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(validatedWithSkewedJoin)) { + val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(optimizedWithSkewedJoin)) { updateBroadcastExchange(optimizedPhysicalPlan) :: Nil } else { updateBroadcastExchange(optimizedPhysicalPlan) :: - updateBroadcastExchange(validatedWithSkewedJoin) :: Nil + updateBroadcastExchange(optimizedWithSkewedJoin) :: Nil } (newPhysicalPlans, optimized) } From 5b63e4db83363d7908b3403a9e40a7d1b2726aa9 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 25 Aug 2021 13:39:18 +0800 Subject: [PATCH 23/37] address comment --- .../adaptive/AdaptiveSparkPlanExec.scala | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index fc8c3f77fc813..145903ba6b099 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -308,21 +308,23 @@ case class AdaptiveSparkPlanExec( // are 
semantically and physically in sync again. val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) val (newPhysicalPlans, newLogicalPlan) = reOptimize(logicalPlan) - val newPhysicalPlan = - (Seq(currentPhysicalPlan) ++ newPhysicalPlans) + // We pick the first newPhysicalPlan if have the same cost otherwise pick smaller cost one + val (preferredNewPhysicalPlan, newCost) = + newPhysicalPlans .map(plan => (plan, costEvaluator.evaluateCost(plan))) .reduce { (last, current) => - if (current._2 < last._2 || (current._2 == last._2 && current._1 != last._1)) { + if (current._2 < last._2) { current } else { last } - }._1 - - if (newPhysicalPlan.ne(currentPhysicalPlan)) { - logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan") - cleanUpTempTags(newPhysicalPlan) - currentPhysicalPlan = newPhysicalPlan + } + val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) + if (newCost < origCost || + (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) { + logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$preferredNewPhysicalPlan") + cleanUpTempTags(preferredNewPhysicalPlan) + currentPhysicalPlan = preferredNewPhysicalPlan currentLogicalPlan = newLogicalPlan stagesToReplace = Seq.empty[QueryStageExec] } @@ -673,7 +675,7 @@ case class AdaptiveSparkPlanExec( applyPhysicalRules( sparkPlan, preprocessingRules ++ queryStagePreparationRules(true), - Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))), + Some((planChangeLogger, "AQE Replanning"))), requiredDistribution) // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will From 3ccc29b695b4343974df2cd2f7330863bfb1b7fc Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 25 Aug 2021 13:50:24 +0800 Subject: [PATCH 24/37] style --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 145903ba6b099..4778ef6128d2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -321,7 +321,7 @@ case class AdaptiveSparkPlanExec( } val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) if (newCost < origCost || - (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) { + (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) { logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$preferredNewPhysicalPlan") cleanUpTempTags(preferredNewPhysicalPlan) currentPhysicalPlan = preferredNewPhysicalPlan From bc45d7040d5cd2a8017594605604dadceeea8fb2 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 25 Aug 2021 16:23:43 +0800 Subject: [PATCH 25/37] fix order --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 4778ef6128d2d..148bb8f28032e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -694,8 +694,8 @@ case class AdaptiveSparkPlanExec( val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(optimizedWithSkewedJoin)) { updateBroadcastExchange(optimizedPhysicalPlan) :: Nil } else { - updateBroadcastExchange(optimizedPhysicalPlan) :: - updateBroadcastExchange(optimizedWithSkewedJoin) :: Nil + updateBroadcastExchange(optimizedWithSkewedJoin) :: + 
updateBroadcastExchange(optimizedPhysicalPlan) :: Nil } (newPhysicalPlans, optimized) } From 580a0a4548e6c25c8191dbf9de7256045bc86e0c Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 26 Aug 2021 16:11:03 +0800 Subject: [PATCH 26/37] address comment --- .../sql/execution/adaptive/AQEUtils.scala | 8 +++-- .../adaptive/AdaptiveSparkPlanExec.scala | 34 ++++++++++++------- .../adaptive/OptimizeSkewedJoin.scala | 9 +++-- .../execution/adaptive/simpleCosting.scala | 8 ++--- .../adaptive/AdaptiveQueryExecSuite.scala | 5 +-- 5 files changed, 37 insertions(+), 27 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala index e2d19e8fed730..40b46f0100dc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala @@ -17,12 +17,12 @@ package org.apache.spark.sql.execution.adaptive -import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution, HashPartitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan} import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec} +import org.apache.spark.sql.internal.SQLConf -object AQEUtils extends SQLConfHelper { +object AQEUtils { // Analyze the given plan and calculate the required distribution of this plan w.r.t. the // user-specified repartition. @@ -63,7 +63,9 @@ object AQEUtils extends SQLConfHelper { // This method is invoked after optimizing skewed join in case we change final stage // output partitioning. 
def ensureRequiredDistribution( - plan: SparkPlan, distribution: Option[Distribution]): SparkPlan = distribution match { + plan: SparkPlan, + distribution: Option[Distribution], + conf: SQLConf): SparkPlan = distribution match { case Some(d) if !plan.outputPartitioning.satisfies(d) => val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions) val shuffleOrigin = if (d.requiredNumPartitions.isDefined) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 148bb8f28032e..9f5581e547cee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -179,6 +179,25 @@ case class AdaptiveSparkPlanExec( optimized } + def prepareQueryStages( + plan: SparkPlan, + optimizeSkewedJoin: Boolean): SparkPlan = { + if (optimizeSkewedJoin) { + AQEUtils.ensureRequiredDistribution( + applyPhysicalRules( + plan, + preprocessingRules ++ queryStagePreparationRules(true), + Some((planChangeLogger, "AQE Replanning"))), + requiredDistribution, + conf) + } else { + applyPhysicalRules( + plan, + preprocessingRules ++ queryStagePreparationRules(), + Some((planChangeLogger, "AQE Replanning"))) + } + } + @transient private val costEvaluator = conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) @@ -664,19 +683,8 @@ case class AdaptiveSparkPlanExec( logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() - - val optimizedPhysicalPlan = applyPhysicalRules( - sparkPlan, - preprocessingRules ++ queryStagePreparationRules(), - Some((planChangeLogger, "AQE Replanning"))) - 
- val optimizedWithSkewedJoin = - AQEUtils.ensureRequiredDistribution( - applyPhysicalRules( - sparkPlan, - preprocessingRules ++ queryStagePreparationRules(true), - Some((planChangeLogger, "AQE Replanning"))), - requiredDistribution) + val optimizedPhysicalPlan = prepareQueryStages(sparkPlan, false) + val optimizedWithSkewedJoin = prepareQueryStages(sparkPlan, true) // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will // add the `BroadcastExchangeExec` node manually in the DPP subquery, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index dcdc67360f98c..0b4a1c2b5a557 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -22,8 +22,9 @@ import scala.collection.mutable import org.apache.commons.io.FileUtils import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleOrigin} +import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.internal.SQLConf @@ -48,9 +49,7 @@ import org.apache.spark.sql.internal.SQLConf * (L3, R3-1), (L3, R3-2), * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2) */ -object OptimizeSkewedJoin extends AQEShuffleReadRule { - - override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS) +object OptimizeSkewedJoin extends Rule[SparkPlan] { /** * A partition is considered as a skewed partition if its size is larger than the median @@ -259,7 +258,7 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule { object ShuffleStage { def 
unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match { case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined && - isSupported(s.shuffle) => + s.shuffle.shuffleOrigin == ENSURE_REQUIREMENTS => Some(s) case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala index 621fb01832ccd..eaae94d9d5628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala @@ -67,17 +67,17 @@ case class SkewJoinAwareCost( */ case class SimpleCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator { override def evaluateCost(plan: SparkPlan): Cost = { - val shuffleNumber = plan.collect { + val numShuffles = plan.collect { case s: ShuffleExchangeLike => s }.size if (forceOptimizeSkewedJoin) { - val skewJoinNumber = plan.collect { + val numSkewJoins = plan.collect { case j: ShuffledJoin if j.isSkewJoin => j }.size - SkewJoinAwareCost(shuffleNumber, skewJoinNumber) + SkewJoinAwareCost(numShuffles, numSkewJoins) } else { - SimpleCost(shuffleNumber) + SimpleCost(numShuffles) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index b50d271e41428..4368e26e71fcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1949,9 +1949,10 @@ class AdaptiveQueryExecSuite s"JOIN skewData2 ON key1 = key2 GROUP BY key1") val shuffles1 = findTopLevelShuffle(adaptive1) assert(shuffles1.size == 3) + // the head shuffle is from second EnsureRequirements in queryStagePreparationRules 
assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS) val smj1 = findTopLevelSortMergeJoin(adaptive1) - assert(smj1.size == 1 && smj1.exists(_.isSkewJoin)) + assert(smj1.size == 1 && smj1.head.isSkewJoin) // only check required distribution val (_, adaptive2) = @@ -1970,7 +1971,7 @@ class AdaptiveQueryExecSuite assert(shuffles2.size == 2) } val smj2 = findTopLevelSortMergeJoin(adaptive2) - assert(smj2.size == 1 && smj2.exists(_.isSkewJoin)) + assert(smj2.size == 1 && smj2.head.isSkewJoin) } } } From bc39694258c88e338d41df7db51aa6b123f0a32a Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 26 Aug 2021 16:33:09 +0800 Subject: [PATCH 27/37] address comment --- .../adaptive/AdaptiveSparkPlanExec.scala | 22 ++++++------------- .../adaptive/AdaptiveQueryExecSuite.scala | 16 ++++++-------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 9f5581e547cee..8c57af999f2f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -100,8 +100,7 @@ case class AdaptiveSparkPlanExec( // A list of physical plan rules to be applied before creation of query stages. The physical // plan should reach a final status of query stages (i.e., no more addition or removal of // Exchange nodes) after running these rules. 
- private def queryStagePreparationRules( - optimizeSkewedJoin: Boolean = false): Seq[Rule[SparkPlan]] = { + private def queryStagePreparationRules(optimizeSkewedJoin: Boolean): Seq[Rule[SparkPlan]] = { val optimizeSkewedJoinRules = if (optimizeSkewedJoin) { Seq(OptimizeSkewedJoin, // Add the EnsureRequirements rule here since OptimizeSkewedJoin will change @@ -182,20 +181,13 @@ case class AdaptiveSparkPlanExec( def prepareQueryStages( plan: SparkPlan, optimizeSkewedJoin: Boolean): SparkPlan = { - if (optimizeSkewedJoin) { - AQEUtils.ensureRequiredDistribution( - applyPhysicalRules( - plan, - preprocessingRules ++ queryStagePreparationRules(true), - Some((planChangeLogger, "AQE Replanning"))), - requiredDistribution, - conf) - } else { + AQEUtils.ensureRequiredDistribution( applyPhysicalRules( plan, - preprocessingRules ++ queryStagePreparationRules(), - Some((planChangeLogger, "AQE Replanning"))) - } + preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin), + Some((planChangeLogger, "AQE Replanning"))), + requiredDistribution, + conf) } @transient private val costEvaluator = @@ -206,7 +198,7 @@ case class AdaptiveSparkPlanExec( @transient val initialPlan = context.session.withActive { applyPhysicalRules( - inputPlan, queryStagePreparationRules(), Some((planChangeLogger, "AQE Preparations"))) + inputPlan, queryStagePreparationRules(false), Some((planChangeLogger, "AQE Preparations"))) } @volatile private var currentPhysicalPlan = initialPlan diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 4368e26e71fcc..548ba8706084a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -96,12 +96,6 @@ class AdaptiveQueryExecSuite 
(dfAdaptive.queryExecution.sparkPlan, adaptivePlan) } - private def findTopLevelShuffle(plan: SparkPlan): Seq[ShuffleExchangeExec] = { - collect(plan) { - case s: ShuffleExchangeExec => s - } - } - private def findTopLevelBroadcastHashJoin(plan: SparkPlan): Seq[BroadcastHashJoinExec] = { collect(plan) { case j: BroadcastHashJoinExec => j @@ -1947,9 +1941,11 @@ class AdaptiveQueryExecSuite val (_, adaptive1) = runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " + s"JOIN skewData2 ON key1 = key2 GROUP BY key1") - val shuffles1 = findTopLevelShuffle(adaptive1) + val shuffles1 = collect(adaptive1) { + case s: ShuffleExchangeExec => s + } assert(shuffles1.size == 3) - // the head shuffle is from second EnsureRequirements in queryStagePreparationRules + // shuffles1.head is the top-level shuffle under the Aggregate operator assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS) val smj1 = findTopLevelSortMergeJoin(adaptive1) assert(smj1.size == 1 && smj1.head.isSkewJoin) @@ -1958,7 +1954,9 @@ class AdaptiveQueryExecSuite val (_, adaptive2) = runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " + s"JOIN skewData2 ON key1 = key2") - val shuffles2 = findTopLevelShuffle(adaptive2) + val shuffles2 = collect(adaptive2) { + case s: ShuffleExchangeExec => s + } if (hasRequiredDistribution) { assert(shuffles2.size == 3) val finalShuffle = shuffles2.head From bb2e71375717ae9ed28ac0cb255e495dc1e334d8 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 2 Sep 2021 13:23:06 +0800 Subject: [PATCH 28/37] address comment --- .../sql/execution/adaptive/AQEUtils.scala | 19 ------- .../adaptive/AdaptiveSparkPlanExec.scala | 15 +++--- .../exchange/EnsureRequirements.scala | 54 +++++++++++++------ 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala index 
40b46f0100dc7..277af212d81f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution, HashPartitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan} import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec} -import org.apache.spark.sql.internal.SQLConf object AQEUtils { @@ -58,22 +57,4 @@ object AQEUtils { } case _ => Some(UnspecifiedDistribution) } - - // Add an extra shuffle if input plan does not satisfy the required distribution. - // This method is invoked after optimizing skewed join in case we change final stage - // output partitioning. - def ensureRequiredDistribution( - plan: SparkPlan, - distribution: Option[Distribution], - conf: SQLConf): SparkPlan = distribution match { - case Some(d) if !plan.outputPartitioning.satisfies(d) => - val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions) - val shuffleOrigin = if (d.requiredNumPartitions.isDefined) { - REPARTITION_BY_NUM - } else { - REPARTITION_BY_COL - } - ShuffleExchangeExec(d.createPartitioning(numPartitions), plan, shuffleOrigin) - case _ => plan - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 8c57af999f2f8..64a08b322b7de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -105,7 +105,7 @@ case class AdaptiveSparkPlanExec( Seq(OptimizeSkewedJoin, // Add the 
EnsureRequirements rule here since OptimizeSkewedJoin will change // output partitioning, make sure we have right distribution. - EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)) + EnsureRequirements(requiredDistribution.isDefined, requiredDistribution)) } else { Nil } @@ -116,7 +116,7 @@ case class AdaptiveSparkPlanExec( // the final plan, but we do need to respect the user-specified repartition. Here we ask // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work // around this case. - EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined), + EnsureRequirements(requiredDistribution.isDefined, requiredDistribution), RemoveRedundantSorts, DisableUnnecessaryBucketedScan ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules @@ -181,13 +181,10 @@ case class AdaptiveSparkPlanExec( def prepareQueryStages( plan: SparkPlan, optimizeSkewedJoin: Boolean): SparkPlan = { - AQEUtils.ensureRequiredDistribution( - applyPhysicalRules( - plan, - preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin), - Some((planChangeLogger, "AQE Replanning"))), - requiredDistribution, - conf) + applyPhysicalRules( + plan, + preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin), + Some((planChangeLogger, "AQE Replanning"))) } @transient private val costEvaluator = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 23716f1081d34..35ca0b3c3ef92 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -38,8 +38,13 @@ import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoin * but can be false in AQE when AQE optimization may change the plan * output 
partitioning and need to retain the user-specified * repartition shuffles in the plan. + * @param requiredDistribution The root required distribution we should ensure. This value is used + * in AQE in case we change final stage output partitioning. */ -case class EnsureRequirements(optimizeOutRepartition: Boolean = true) extends Rule[SparkPlan] { +case class EnsureRequirements( + optimizeOutRepartition: Boolean = true, + requiredDistribution: Option[Distribution] = None) + extends Rule[SparkPlan] { private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = { val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution @@ -254,25 +259,40 @@ case class EnsureRequirements(optimizeOutRepartition: Boolean = true) extends Ru } } - def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin) + def apply(plan: SparkPlan): SparkPlan = { + val newPlan = plan.transformUp { + case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin) if optimizeOutRepartition && (shuffleOrigin == REPARTITION_BY_COL || shuffleOrigin == REPARTITION_BY_NUM) => - def hasSemanticEqualPartitioning(partitioning: Partitioning): Boolean = { - partitioning match { - case lower: HashPartitioning if upper.semanticEquals(lower) => true - case lower: PartitioningCollection => - lower.partitionings.exists(hasSemanticEqualPartitioning) - case _ => false + def hasSemanticEqualPartitioning(partitioning: Partitioning): Boolean = { + partitioning match { + case lower: HashPartitioning if upper.semanticEquals(lower) => true + case lower: PartitioningCollection => + lower.partitionings.exists(hasSemanticEqualPartitioning) + case _ => false + } + } + if (hasSemanticEqualPartitioning(child.outputPartitioning)) { + child + } else { + operator } - } - if (hasSemanticEqualPartitioning(child.outputPartitioning)) { - child - } else { - operator - } - case operator: SparkPlan 
=> - ensureDistributionAndOrdering(reorderJoinPredicates(operator)) + case operator: SparkPlan => + ensureDistributionAndOrdering(reorderJoinPredicates(operator)) + } + + requiredDistribution match { + case Some(d) if !newPlan.outputPartitioning.satisfies(d) => + val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions) + val shuffleOrigin = if (d.requiredNumPartitions.isDefined) { + REPARTITION_BY_NUM + } else { + REPARTITION_BY_COL + } + ShuffleExchangeExec(d.createPartitioning(numPartitions), newPlan, shuffleOrigin) + + case _ => newPlan + } } } From d3f013114554f951bd49ade34ce8f8b9b64a1ece Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 2 Sep 2021 13:26:05 +0800 Subject: [PATCH 29/37] nit --- .../spark/sql/execution/exchange/EnsureRequirements.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 35ca0b3c3ef92..363f9751fc963 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -262,8 +262,8 @@ case class EnsureRequirements( def apply(plan: SparkPlan): SparkPlan = { val newPlan = plan.transformUp { case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin) - if optimizeOutRepartition && - (shuffleOrigin == REPARTITION_BY_COL || shuffleOrigin == REPARTITION_BY_NUM) => + if optimizeOutRepartition && + (shuffleOrigin == REPARTITION_BY_COL || shuffleOrigin == REPARTITION_BY_NUM) => def hasSemanticEqualPartitioning(partitioning: Partitioning): Boolean = { partitioning match { case lower: HashPartitioning if upper.semanticEquals(lower) => true From 4712986d217e515197a4f6bee1ae9790fda62087 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 2 Sep 2021 13:46:49 +0800 
Subject: [PATCH 30/37] nit --- .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 64a08b322b7de..654a85bde657f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -116,7 +116,7 @@ case class AdaptiveSparkPlanExec( // the final plan, but we do need to respect the user-specified repartition. Here we ask // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work // around this case. - EnsureRequirements(requiredDistribution.isDefined, requiredDistribution), + EnsureRequirements(requiredDistribution.isDefined), RemoveRedundantSorts, DisableUnnecessaryBucketedScan ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules From 23ebea0a408469f5c2a70e04982ffa6261dae96c Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Sun, 5 Sep 2021 21:32:44 +0800 Subject: [PATCH 31/37] address comment --- .../adaptive/AdaptiveSparkPlanExec.scala | 99 +++++++------------ .../adaptive/OptimizeSkewedJoin.scala | 22 ++++- .../exchange/EnsureRequirements.scala | 60 ++++++----- 3 files changed, 89 insertions(+), 92 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 654a85bde657f..eb099a0409dec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -97,30 +97,26 @@ case class AdaptiveSparkPlanExec( 
AQEUtils.getRequiredDistribution(inputPlan) } + @transient private val costEvaluator = + conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { + case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) + case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) + } + // A list of physical plan rules to be applied before creation of query stages. The physical // plan should reach a final status of query stages (i.e., no more addition or removal of // Exchange nodes) after running these rules. - private def queryStagePreparationRules(optimizeSkewedJoin: Boolean): Seq[Rule[SparkPlan]] = { - val optimizeSkewedJoinRules = if (optimizeSkewedJoin) { - Seq(OptimizeSkewedJoin, - // Add the EnsureRequirements rule here since OptimizeSkewedJoin will change - // output partitioning, make sure we have right distribution. - EnsureRequirements(requiredDistribution.isDefined, requiredDistribution)) - } else { - Nil - } - - Seq( - RemoveRedundantProjects, - // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for - // the final plan, but we do need to respect the user-specified repartition. Here we ask - // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work - // around this case. - EnsureRequirements(requiredDistribution.isDefined), - RemoveRedundantSorts, - DisableUnnecessaryBucketedScan - ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules - } + @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( + RemoveRedundantProjects, + // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for + // the final plan, but we do need to respect the user-specified repartition. Here we ask + // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work + // around this case. 
+ EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined), + RemoveRedundantSorts, + DisableUnnecessaryBucketedScan, + OptimizeSkewedJoin(requiredDistribution, costEvaluator) + ) ++ context.session.sessionState.queryStagePrepRules // A list of physical optimizer rules to be applied to a new stage before its execution. These // optimizations should be stage-independent. @@ -178,24 +174,9 @@ case class AdaptiveSparkPlanExec( optimized } - def prepareQueryStages( - plan: SparkPlan, - optimizeSkewedJoin: Boolean): SparkPlan = { - applyPhysicalRules( - plan, - preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin), - Some((planChangeLogger, "AQE Replanning"))) - } - - @transient private val costEvaluator = - conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match { - case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf) - case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)) - } - @transient val initialPlan = context.session.withActive { applyPhysicalRules( - inputPlan, queryStagePreparationRules(false), Some((planChangeLogger, "AQE Preparations"))) + inputPlan, queryStagePreparationRules, Some((planChangeLogger, "AQE Preparations"))) } @volatile private var currentPhysicalPlan = initialPlan @@ -315,24 +296,14 @@ case class AdaptiveSparkPlanExec( // plans are updated, we can clear the query stage list because at this point the two plans // are semantically and physically in sync again. 
val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace) - val (newPhysicalPlans, newLogicalPlan) = reOptimize(logicalPlan) - // We pick the first newPhysicalPlan if have the same cost otherwise pick smaller cost one - val (preferredNewPhysicalPlan, newCost) = - newPhysicalPlans - .map(plan => (plan, costEvaluator.evaluateCost(plan))) - .reduce { (last, current) => - if (current._2 < last._2) { - current - } else { - last - } - } + val (newPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan) val origCost = costEvaluator.evaluateCost(currentPhysicalPlan) + val newCost = costEvaluator.evaluateCost(newPhysicalPlan) if (newCost < origCost || - (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) { - logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$preferredNewPhysicalPlan") - cleanUpTempTags(preferredNewPhysicalPlan) - currentPhysicalPlan = preferredNewPhysicalPlan + (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) { + logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan") + cleanUpTempTags(newPhysicalPlan) + currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan stagesToReplace = Seq.empty[QueryStageExec] } @@ -668,12 +639,14 @@ case class AdaptiveSparkPlanExec( * Re-optimize and run physical planning on the current logical plan based on the latest stats. 
*/ private def reOptimize( - logicalPlan: LogicalPlan): (Seq[SparkPlan], LogicalPlan) = context.qe.withCteMap { + logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = context.qe.withCteMap { logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() - val optimizedPhysicalPlan = prepareQueryStages(sparkPlan, false) - val optimizedWithSkewedJoin = prepareQueryStages(sparkPlan, true) + val newPlan = applyPhysicalRules( + sparkPlan, + preprocessingRules ++ queryStagePreparationRules, + Some((planChangeLogger, "AQE Replanning"))) // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will // add the `BroadcastExchangeExec` node manually in the DPP subquery, @@ -682,19 +655,13 @@ case class AdaptiveSparkPlanExec( // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery. // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule. 
- def updateBroadcastExchange(plan: SparkPlan): SparkPlan = currentPhysicalPlan match { + val finalPlan = currentPhysicalPlan match { case b: BroadcastExchangeLike - if (!plan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(plan)) - case _ => plan + if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan)) + case _ => newPlan } - val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(optimizedWithSkewedJoin)) { - updateBroadcastExchange(optimizedPhysicalPlan) :: Nil - } else { - updateBroadcastExchange(optimizedWithSkewedJoin) :: - updateBroadcastExchange(optimizedPhysicalPlan) :: Nil - } - (newPhysicalPlans, optimized) + (finalPlan, optimized) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 0b4a1c2b5a557..72cc3c069f6bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -22,9 +22,10 @@ import scala.collection.mutable import org.apache.commons.io.FileUtils import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.physical.Distribution import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements} import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.internal.SQLConf @@ -49,7 +50,10 @@ import org.apache.spark.sql.internal.SQLConf * (L3, R3-1), (L3, R3-2), * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2) */ -object OptimizeSkewedJoin extends Rule[SparkPlan] { +case class OptimizeSkewedJoin( + requiredDistribution: Option[Distribution], + 
costEvaluator: CostEvaluator) + extends Rule[SparkPlan] { /** * A partition is considered as a skewed partition if its size is larger than the median @@ -249,7 +253,19 @@ object OptimizeSkewedJoin extends Rule[SparkPlan] { // SHJ // Shuffle // Shuffle - optimizeSkewJoin(plan) + val optimized = + EnsureRequirements(requiredDistribution.isDefined, requiredDistribution) + .apply(optimizeSkewJoin(plan)) + val originCost = costEvaluator.evaluateCost(plan) + val optimizedCost = costEvaluator.evaluateCost(optimized) + // two cases we will pick new plan: + // 1. optimize the skew join without extra shuffle + // 2. optimize the skew join with extra shuffle but the costEvaluator think it's better + if (optimizedCost < originCost || (originCost == optimizedCost && optimized != plan)) { + optimized + } else { + plan + } } else { plan } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 363f9751fc963..c2a2f8d2df023 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -46,15 +46,15 @@ case class EnsureRequirements( requiredDistribution: Option[Distribution] = None) extends Rule[SparkPlan] { - private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = { - val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution - val requiredChildOrderings: Seq[Seq[SortOrder]] = operator.requiredChildOrdering - var children: Seq[SparkPlan] = operator.children - assert(requiredChildDistributions.length == children.length) - assert(requiredChildOrderings.length == children.length) - + private def ensureDistributionAndOrdering( + originChildren: Seq[SparkPlan], + requiredChildDistributions: Seq[Distribution], + requiredChildOrderings: Seq[Seq[SortOrder]], + 
isRootDistribution: Boolean): Seq[SparkPlan] = { + assert(requiredChildDistributions.length == originChildren.length) + assert(requiredChildOrderings.length == originChildren.length) // Ensure that the operator's children satisfy their output distribution requirements. - children = children.zip(requiredChildDistributions).map { + var children = originChildren.zip(requiredChildDistributions).map { case (child, distribution) if child.outputPartitioning.satisfies(distribution) => child case (child, BroadcastDistribution(mode)) => @@ -62,7 +62,16 @@ case class EnsureRequirements( case (child, distribution) => val numPartitions = distribution.requiredNumPartitions .getOrElse(conf.numShufflePartitions) - ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child) + val shuffleOrigin = if (isRootDistribution) { + if (distribution.requiredNumPartitions.isDefined) { + REPARTITION_BY_NUM + } else { + REPARTITION_BY_COL + } + } else { + ENSURE_REQUIREMENTS + } + ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child, shuffleOrigin) } // Get the indexes of children which have specified distribution requirements and need to have @@ -83,7 +92,7 @@ case class EnsureRequirements( index => requiredChildDistributions(index).requiredNumPartitions }.toSet assert(numPartitionsSet.size <= 1, - s"$operator have incompatible requirements of the number of partitions for its children") + s"$requiredChildDistributions have incompatible requirements of the number of partitions") numPartitionsSet.headOption } @@ -138,7 +147,7 @@ case class EnsureRequirements( } } - operator.withNewChildren(children) + children } private def reorder( @@ -279,20 +288,25 @@ case class EnsureRequirements( } case operator: SparkPlan => - ensureDistributionAndOrdering(reorderJoinPredicates(operator)) + val reordered = reorderJoinPredicates(operator) + val newChildren = ensureDistributionAndOrdering( + reordered.children, + reordered.requiredChildDistribution, + 
reordered.requiredChildOrdering, + false) + reordered.withNewChildren(newChildren) } - requiredDistribution match { - case Some(d) if !newPlan.outputPartitioning.satisfies(d) => - val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions) - val shuffleOrigin = if (d.requiredNumPartitions.isDefined) { - REPARTITION_BY_NUM - } else { - REPARTITION_BY_COL - } - ShuffleExchangeExec(d.createPartitioning(numPartitions), newPlan, shuffleOrigin) - - case _ => newPlan + if (requiredDistribution.isDefined) { + val finalPlan = ensureDistributionAndOrdering( + newPlan :: Nil, + requiredDistribution.get :: Nil, + Seq(Nil), + true) + assert(finalPlan.size == 1) + finalPlan.head + } else { + newPlan } } } From ef0765f820185d05902ee35bd3bcc7851c4a1457 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 7 Sep 2021 16:00:14 +0800 Subject: [PATCH 32/37] pass EnsureRequirements --- .../sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 4 +++- .../spark/sql/execution/adaptive/OptimizeSkewedJoin.scala | 6 ++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index eb099a0409dec..5284c4979bb1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -115,7 +115,9 @@ case class AdaptiveSparkPlanExec( EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined), RemoveRedundantSorts, DisableUnnecessaryBucketedScan, - OptimizeSkewedJoin(requiredDistribution, costEvaluator) + OptimizeSkewedJoin( + EnsureRequirements(requiredDistribution.isDefined, requiredDistribution), + costEvaluator) ) ++ context.session.sessionState.queryStagePrepRules // A list of physical optimizer rules to be applied to a new stage 
before its execution. These diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 72cc3c069f6bd..58e88a6931d1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -51,7 +51,7 @@ import org.apache.spark.sql.internal.SQLConf * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2) */ case class OptimizeSkewedJoin( - requiredDistribution: Option[Distribution], + ensureRequirements: EnsureRequirements, costEvaluator: CostEvaluator) extends Rule[SparkPlan] { @@ -253,9 +253,7 @@ case class OptimizeSkewedJoin( // SHJ // Shuffle // Shuffle - val optimized = - EnsureRequirements(requiredDistribution.isDefined, requiredDistribution) - .apply(optimizeSkewJoin(plan)) + val optimized = ensureRequirements.apply(optimizeSkewJoin(plan)) val originCost = costEvaluator.evaluateCost(plan) val optimizedCost = costEvaluator.evaluateCost(optimized) // two cases we will pick new plan: From 76c363daacfcf35a08378859c710f3bd916bc153 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 7 Sep 2021 16:03:12 +0800 Subject: [PATCH 33/37] simplify --- .../spark/sql/execution/adaptive/OptimizeSkewedJoin.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 58e88a6931d1c..641baad9877a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -259,7 +259,7 @@ case class OptimizeSkewedJoin( // two cases we will pick new plan: // 1. 
optimize the skew join without extra shuffle // 2. optimize the skew join with extra shuffle but the costEvaluator think it's better - if (optimizedCost < originCost || (originCost == optimizedCost && optimized != plan)) { + if (optimizedCost <= originCost) { optimized } else { plan From 89610842f656d87f531a23337d027f6ec1f1ba08 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 7 Sep 2021 16:42:53 +0800 Subject: [PATCH 34/37] nit --- .../apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 641baad9877a5..2fe5b18a75ec8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -22,7 +22,6 @@ import scala.collection.mutable import org.apache.commons.io.FileUtils import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.Distribution import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements} From 5ba73c4e2958e807b11f5de4f9be56ea4b854964 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 7 Sep 2021 18:01:51 +0800 Subject: [PATCH 35/37] EnsureRequirements --- .../adaptive/AdaptiveSparkPlanExec.scala | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 5284c4979bb1e..13c9528323ae8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -106,19 +106,21 @@ case class AdaptiveSparkPlanExec( // A list of physical plan rules to be applied before creation of query stages. The physical // plan should reach a final status of query stages (i.e., no more addition or removal of // Exchange nodes) after running these rules. - @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( - RemoveRedundantProjects, + @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = { // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for // the final plan, but we do need to respect the user-specified repartition. Here we ask // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work // around this case. - EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined), - RemoveRedundantSorts, - DisableUnnecessaryBucketedScan, - OptimizeSkewedJoin( - EnsureRequirements(requiredDistribution.isDefined, requiredDistribution), - costEvaluator) - ) ++ context.session.sessionState.queryStagePrepRules + val ensureRequirements = + EnsureRequirements(requiredDistribution.isDefined, requiredDistribution) + Seq( + RemoveRedundantProjects, + ensureRequirements, + RemoveRedundantSorts, + DisableUnnecessaryBucketedScan, + OptimizeSkewedJoin(ensureRequirements, costEvaluator) + ) ++ context.session.sessionState.queryStagePrepRules + } // A list of physical optimizer rules to be applied to a new stage before its execution. These // optimizations should be stage-independent. 
From ca6332167f20ca85c4ae23100a475530f9118a8e Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Thu, 9 Sep 2021 10:38:48 +0800 Subject: [PATCH 36/37] pull out shuffle origin --- .../exchange/EnsureRequirements.scala | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index c2a2f8d2df023..687b4c4490e09 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -50,7 +50,7 @@ case class EnsureRequirements( originChildren: Seq[SparkPlan], requiredChildDistributions: Seq[Distribution], requiredChildOrderings: Seq[Seq[SortOrder]], - isRootDistribution: Boolean): Seq[SparkPlan] = { + shuffleOrigin: ShuffleOrigin): Seq[SparkPlan] = { assert(requiredChildDistributions.length == originChildren.length) assert(requiredChildOrderings.length == originChildren.length) // Ensure that the operator's children satisfy their output distribution requirements. 
@@ -62,15 +62,6 @@ case class EnsureRequirements( case (child, distribution) => val numPartitions = distribution.requiredNumPartitions .getOrElse(conf.numShufflePartitions) - val shuffleOrigin = if (isRootDistribution) { - if (distribution.requiredNumPartitions.isDefined) { - REPARTITION_BY_NUM - } else { - REPARTITION_BY_COL - } - } else { - ENSURE_REQUIREMENTS - } ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child, shuffleOrigin) } @@ -293,16 +284,21 @@ case class EnsureRequirements( reordered.children, reordered.requiredChildDistribution, reordered.requiredChildOrdering, - false) + ENSURE_REQUIREMENTS) reordered.withNewChildren(newChildren) } if (requiredDistribution.isDefined) { + val shuffleOrigin = if (requiredDistribution.get.requiredNumPartitions.isDefined) { + REPARTITION_BY_NUM + } else { + REPARTITION_BY_COL + } val finalPlan = ensureDistributionAndOrdering( newPlan :: Nil, requiredDistribution.get :: Nil, Seq(Nil), - true) + shuffleOrigin) assert(finalPlan.size == 1) finalPlan.head } else { From f5e4b911ea608cdf863215feb6e13996a57cb294 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Sat, 11 Sep 2021 18:00:19 +0800 Subject: [PATCH 37/37] address comment --- .../sql/execution/adaptive/simpleCosting.scala | 6 +++++- .../exchange/EnsureRequirements.scala | 18 +++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala index eaae94d9d5628..864563be38557 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala @@ -36,7 +36,11 @@ case class SimpleCost(value: Long) extends Cost { } /** - * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number + * A skew join aware implementation 
of [[Cost]], which considers shuffle and skew join counts. + * + * We always pick the cost which has more skew join even if it introduces one or more extra shuffle. + * Otherwise, if two costs have the same number of skew join or no skew join, we will pick the one + * with the smaller number of shuffles. */ case class SkewJoinAwareCost( numShuffles: Int, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 687b4c4490e09..86b2344629d26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -47,14 +47,14 @@ case class EnsureRequirements( extends Rule[SparkPlan] { private def ensureDistributionAndOrdering( - originChildren: Seq[SparkPlan], + originalChildren: Seq[SparkPlan], requiredChildDistributions: Seq[Distribution], requiredChildOrderings: Seq[Seq[SortOrder]], shuffleOrigin: ShuffleOrigin): Seq[SparkPlan] = { - assert(requiredChildDistributions.length == originChildren.length) - assert(requiredChildOrderings.length == originChildren.length) + assert(requiredChildDistributions.length == originalChildren.length) + assert(requiredChildOrderings.length == originalChildren.length) // Ensure that the operator's children satisfy their output distribution requirements. 
- var children = originChildren.zip(requiredChildDistributions).map { + var newChildren = originalChildren.zip(requiredChildDistributions).map { case (child, distribution) if child.outputPartitioning.satisfies(distribution) => child case (child, BroadcastDistribution(mode)) => @@ -74,7 +74,7 @@ case class EnsureRequirements( }.map(_._2) val childrenNumPartitions = - childrenIndexes.map(children(_).outputPartitioning.numPartitions).toSet + childrenIndexes.map(newChildren(_).outputPartitioning.numPartitions).toSet if (childrenNumPartitions.size > 1) { // Get the number of partitions which is explicitly required by the distributions. @@ -92,7 +92,7 @@ case class EnsureRequirements( // 1. We should avoid shuffling these children. // 2. We should have a reasonable parallelism. val nonShuffleChildrenNumPartitions = - childrenIndexes.map(children).filterNot(_.isInstanceOf[ShuffleExchangeExec]) + childrenIndexes.map(newChildren).filterNot(_.isInstanceOf[ShuffleExchangeExec]) .map(_.outputPartitioning.numPartitions) val expectedChildrenNumPartitions = if (nonShuffleChildrenNumPartitions.nonEmpty) { if (nonShuffleChildrenNumPartitions.length == childrenIndexes.length) { @@ -111,7 +111,7 @@ case class EnsureRequirements( val targetNumPartitions = requiredNumPartitions.getOrElse(expectedChildrenNumPartitions) - children = children.zip(requiredChildDistributions).zipWithIndex.map { + newChildren = newChildren.zip(requiredChildDistributions).zipWithIndex.map { case ((child, distribution), index) if childrenIndexes.contains(index) => if (child.outputPartitioning.numPartitions == targetNumPartitions) { child @@ -129,7 +129,7 @@ case class EnsureRequirements( } // Now that we've performed any necessary shuffles, add sorts to guarantee output orderings: - children = children.zip(requiredChildOrderings).map { case (child, requiredOrdering) => + newChildren = newChildren.zip(requiredChildOrderings).map { case (child, requiredOrdering) => // If child.outputOrdering already satisfies 
the requiredOrdering, we do not need to sort. if (SortOrder.orderingSatisfies(child.outputOrdering, requiredOrdering)) { child @@ -138,7 +138,7 @@ case class EnsureRequirements( } } - children + newChildren } private def reorder(