From 49e8bd9d7be91be1f6b23e1e929623cffbd126d2 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 8 Jun 2021 13:56:59 +0800
Subject: [PATCH 01/37] Support optimizing skew join even if it introduces
 extra shuffle

---
 .../apache/spark/sql/internal/SQLConf.scala   |  6 ++
 .../adaptive/AdaptiveSparkPlanExec.scala      | 55 ++++++++++++++++--
 .../adaptive/OptimizeSkewedJoin.scala         | 19 +-----
 .../adaptive/SkewJoinAwareCost.scala          | 58 +++++++++++++++++++
 .../adaptive/AdaptiveQueryExecSuite.scala     | 35 +++++++++++
 5 files changed, 153 insertions(+), 20 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 3aed3c274fc76..f85a0b7d4d189 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -653,6 +653,12 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val ADAPTIVE_FORCE_ENABLE_SKEW_JOIN = buildConf("spark.sql.adaptive.forceEnableSkewJoin")
+    .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.")
+    .version("3.2.0")
+    .booleanConf
+    .createWithDefault(false)
+
   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")
       .internal()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index cbf70e37ce961..088cb677e5a82 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -88,6 +88,12 @@ case class AdaptiveSparkPlanExec(
   private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq(
     RemoveRedundantProjects,
     EnsureRequirements,
+    // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of
+    // skew join and extra shuffle nodes.
+    OptimizeSkewedJoin,
+    // Add the EnsureRequirements rule here since OptimizeSkewedJoin may change the
+    // output partitioning
+    EnsureRequirements,
     RemoveRedundantSorts,
     DisableUnnecessaryBucketedScan
   ) ++ context.session.sessionState.queryStagePrepRules
@@ -97,8 +103,6 @@ case class AdaptiveSparkPlanExec(
   @transient private val queryStageOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
     PlanAdaptiveDynamicPruningFilters(this),
     ReuseAdaptiveSubquery(context.subqueryCache),
-    // Skew join does not handle `CustomShuffleReader` so needs to be applied first.
-    OptimizeSkewedJoin,
     OptimizeSkewInRebalancePartitions,
     CoalesceShufflePartitions(context.session),
     // `OptimizeLocalShuffleReader` needs to make use of 'CustomShuffleReaderExec.partitionSpecs'
@@ -113,6 +117,19 @@ case class AdaptiveSparkPlanExec(
     CollapseCodegenStages()
   )
 
+  // OptimizeSkewedJoin has moved into preparation rules, so we should make
+  // finalPreparationStageRules same as finalStageOptimizerRules
+  private def finalPreparationStageRules: Seq[Rule[SparkPlan]] = {
+    val origins = inputPlan.collect {
+      case s: ShuffleExchangeLike => s.shuffleOrigin
+    }
+    (preprocessingRules ++ queryStagePreparationRules).filter {
+      case c: CustomShuffleReaderRule =>
+        origins.forall(c.supportedShuffleOrigins.contains)
+      case _ => true
+    }
+  }
+
   // The partitioning of the query output depends on the shuffle(s) in the final stage. If the
   // original plan contains a repartition operator, we need to preserve the specified partitioning,
   // whether or not the repartition-introduced shuffle is optimized out because of an underlying
@@ -130,7 +147,12 @@ case class AdaptiveSparkPlanExec(
     }
   }
 
-  @transient private val costEvaluator = SimpleCostEvaluator
+  @transient private val costEvaluator =
+    if (conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN)) {
+      SkewJoinAwareCostEvaluator
+    } else {
+      SimpleCostEvaluator
+    }
 
   @transient val initialPlan = context.session.withActive {
     applyPhysicalRules(
@@ -593,6 +615,25 @@ case class AdaptiveSparkPlanExec(
     logicalPlan
   }
 
+  private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
+    sparkPlan match {
+      // avoid top level node is Exchange
+      case _: Exchange => false
+      case plan =>
+        // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage
+        // and all query stages are materialized.
+        plan.find {
+          case p if p.children.exists(
+            child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) =>
+            p match {
+              case stage: QueryStageExec if stage.isMaterialized => false
+              case _ => true
+            }
+          case _ => false
+        }.isEmpty
+    }
+  }
+
   /**
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */
@@ -600,9 +641,15 @@ case class AdaptiveSparkPlanExec(
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
+    val rules = if (isFinalStage(sparkPlan)) {
+      finalPreparationStageRules
+    } else {
+      preprocessingRules ++ queryStagePreparationRules
+    }
+
     val newPlan = applyPhysicalRules(
       sparkPlan,
-      preprocessingRules ++ queryStagePreparationRules,
+      rules,
       Some((planChangeLogger, "AQE Replanning")))
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index a284016bfb2ef..96c030fcd237b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -23,7 +23,7 @@ import org.apache.commons.io.FileUtils
 
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleOrigin}
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleOrigin}
 import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.internal.SQLConf
 
@@ -52,8 +52,6 @@ object OptimizeSkewedJoin extends CustomShuffleReaderRule {
 
   override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS)
 
-  private val ensureRequirements = EnsureRequirements
-
   /**
    * A partition is considered as a skewed partition if its size is larger than the median
    * partition size * SKEW_JOIN_SKEWED_PARTITION_FACTOR and also larger than
@@ -248,18 +246,7 @@ object OptimizeSkewedJoin extends CustomShuffleReaderRule {
       //     Shuffle
       //   Sort
       //     Shuffle
-      val optimizePlan = optimizeSkewJoin(plan)
-      val numShuffles = ensureRequirements.apply(optimizePlan).collect {
-        case e: ShuffleExchangeExec => e
-      }.length
-
-      if (numShuffles > 0) {
-        logDebug("OptimizeSkewedJoin rule is not applied due" +
-          " to additional shuffles will be introduced.")
-        plan
-      } else {
-        optimizePlan
-      }
+      optimizeSkewJoin(plan)
     } else {
       plan
     }
@@ -268,7 +255,7 @@ object OptimizeSkewedJoin extends CustomShuffleReaderRule {
 
 private object ShuffleStage {
   def unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match {
-    case s: ShuffleQueryStageExec if s.mapStats.isDefined &&
+    case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined &&
         OptimizeSkewedJoin.supportedShuffleOrigins.contains(s.shuffle.shuffleOrigin) =>
       Some(s)
     case _ => None
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
new file mode 100644
index 0000000000000..a2e101216a548
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike
+import org.apache.spark.sql.execution.joins.ShuffledJoin
+
+/**
+ * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number
+ */
+case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost {
+  override def compare(that: Cost): Int = that match {
+    case other: SkewJoinAwareCost =>
+      if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) {
+        // More optimized skew joins always wins; the shuffle count only breaks ties
+        if (numSkewJoins >= other.numSkewJoins) -1 else 1
+      } else if (numSkewJoins < other.numSkewJoins || numShuffles > other.numShuffles) {
+        1
+      } else {
+        0
+      }
+    case _ =>
+      throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString)
+  }
+}
+
+/**
+ * A skew join aware implementation of [[CostEvaluator]], which counts the number of
+ * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan.
+ */
+object SkewJoinAwareCostEvaluator extends CostEvaluator {
+  override def evaluateCost(plan: SparkPlan): Cost = {
+    val shuffleNumber = plan.collect {
+      case s: ShuffleExchangeLike => s
+    }.size
+    val skewJoinNumber = plan.collect {
+      case j: ShuffledJoin if j.isSkewJoin => j
+    }.size
+    SkewJoinAwareCost(shuffleNumber, skewJoinNumber)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index 2343a9236e4f4..7b2eab28c51cc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1786,6 +1786,7 @@ class AdaptiveQueryExecSuite
     }
   }
 
+<<<<<<< HEAD
   test("SPARK-35650: Coalesce number of partitions by AEQ") {
     withSQLConf(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1") {
       Seq("REPARTITION", "REBALANCE(key)")
@@ -1885,4 +1886,38 @@ class AdaptiveQueryExecSuite
       }
     }
   }
+
+  test("SPARK-33832: Support optimize skew join even if introduce extra shuffle") {
+    withSQLConf(
+      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
+      SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100",
+      SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",
+      SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1",
+      SQLConf.SHUFFLE_PARTITIONS.key -> "10",
+      SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN.key -> "true") {
+      withTempView("skewData1", "skewData2") {
+        spark
+          .range(0, 1000, 1, 10)
+          .selectExpr("id % 3 as key1", "id as value1")
+          .createOrReplaceTempView("skewData1")
+        spark
+          .range(0, 1000, 1, 10)
+          .selectExpr("id % 1 as key2", "id as value2")
+          .createOrReplaceTempView("skewData2")
+
+        val (_, adaptive) = runAdaptiveAndVerifyResult(
+          "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1")
+        val smj = findTopLevelSortMergeJoin(adaptive)
+        assert(smj.size == 1 && smj.forall(_.isSkewJoin))
+        checkNumLocalShuffleReaders(adaptive, 3)
+
+        val (_, adaptive2) = runAdaptiveAndVerifyResult(
+          "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2")
+        val smj2 = findTopLevelSortMergeJoin(adaptive2)
+        assert(smj2.size == 1 && smj2.forall(_.isSkewJoin))
+        checkNumLocalShuffleReaders(adaptive2, 3)
+      }
+    }
+  }
 }

From db77ddd6f743ebc61e0a5539a025128b23eb2814 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 8 Jun 2021 17:42:58 +0800
Subject: [PATCH 02/37] Apply EnsureRequirements before the final-stage check

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 088cb677e5a82..216b837f241d8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -641,7 +641,7 @@ case class AdaptiveSparkPlanExec(
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
-    val rules = if (isFinalStage(sparkPlan)) {
+    val rules = if (isFinalStage(EnsureRequirements.apply(sparkPlan))) {
       finalPreparationStageRules
     } else {
       preprocessingRules ++ queryStagePreparationRules

From a63cd7286e77ee68dad90f54925373b29f3d6398 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 9 Jun 2021 11:48:07 +0800
Subject: [PATCH 03/37] Move skew join rules into a separate rule list

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 44 ++++++++++++-------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 216b837f241d8..017ee417f75bc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -88,15 +88,20 @@ case class AdaptiveSparkPlanExec(
   private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq(
     RemoveRedundantProjects,
     EnsureRequirements,
+    RemoveRedundantSorts,
+    DisableUnnecessaryBucketedScan
+  ) ++ context.session.sessionState.queryStagePrepRules
+
+  // This list rules are applied between queryStagePreparationRules and estimate physical plan cost
+  // so that we can support introduce extra shuffle
+  @transient private val queryStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq(
     // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of
     // skew join and extra shuffle nodes.
     OptimizeSkewedJoin,
     // Add the EnsureRequirements rule here since OptimizeSkewedJoin may change the
     // output partitioning
-    EnsureRequirements,
-    RemoveRedundantSorts,
-    DisableUnnecessaryBucketedScan
-  ) ++ context.session.sessionState.queryStagePrepRules
+    EnsureRequirements
+  )
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
   // optimizations should be stage-independent.
@@ -117,13 +122,13 @@ case class AdaptiveSparkPlanExec(
     CollapseCodegenStages()
   )
 
-  // OptimizeSkewedJoin has moved into preparation rules, so we should make
-  // finalPreparationStageRules same as finalStageOptimizerRules
-  private def finalPreparationStageRules: Seq[Rule[SparkPlan]] = {
+  // OptimizeSkewedJoin has moved into this rules, so we should follow the finalStageOptimizerRules
+  // for the final stage.
+  private def finalStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = {
     val origins = inputPlan.collect {
       case s: ShuffleExchangeLike => s.shuffleOrigin
     }
-    (preprocessingRules ++ queryStagePreparationRules).filter {
+    queryStagePreparationWithExtraShuffleRules.filter {
       case c: CustomShuffleReaderRule =>
         origins.forall(c.supportedShuffleOrigins.contains)
       case _ => true
@@ -641,15 +646,19 @@ case class AdaptiveSparkPlanExec(
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
-    val rules = if (isFinalStage(EnsureRequirements.apply(sparkPlan))) {
-      finalPreparationStageRules
-    } else {
-      preprocessingRules ++ queryStagePreparationRules
-    }
-
     val newPlan = applyPhysicalRules(
       sparkPlan,
-      rules,
+      preprocessingRules ++ queryStagePreparationRules,
+      Some((planChangeLogger, "AQE Replanning")))
+
+    val preparationWithExtraShuffleRules = if (isFinalStage(newPlan)) {
+      finalStagePreparationWithExtraShuffleRules
+    } else {
+      queryStagePreparationWithExtraShuffleRules
+    }
+    val newPlanWithExtraShuffle = applyPhysicalRules(
+      newPlan,
+      preparationWithExtraShuffleRules,
       Some((planChangeLogger, "AQE Replanning")))
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
@@ -661,8 +670,9 @@ case class AdaptiveSparkPlanExec(
     // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule.
     val finalPlan = currentPhysicalPlan match {
       case b: BroadcastExchangeLike
-        if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan))
-      case _ => newPlan
+        if (!newPlanWithExtraShuffle.isInstanceOf[BroadcastExchangeLike]) =>
+        b.withNewChildren(Seq(newPlanWithExtraShuffle))
+      case _ => newPlanWithExtraShuffle
     }
 
     (finalPlan, optimized)

From 59a5e4a9ac3f322df504465caba09ec851c99d69 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 10 Jun 2021 13:52:52 +0800
Subject: [PATCH 04/37] Fix expected number of local shuffle readers in test

---
 .../spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index 7b2eab28c51cc..d3726ed31f76e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1916,7 +1916,8 @@ class AdaptiveQueryExecSuite
           "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2")
         val smj2 = findTopLevelSortMergeJoin(adaptive2)
         assert(smj2.size == 1 && smj2.forall(_.isSkewJoin))
-        checkNumLocalShuffleReaders(adaptive2, 3)
+        // top level shuffle reader is local
+        checkNumLocalShuffleReaders(adaptive2, 2)
       }
     }
   }

From 9c985da9870e107311cef0383a2a40703e4f4f07 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 2 Jul 2021 12:13:59 +0800
Subject: [PATCH 05/37] Evaluate the cost of the extra-shuffle plan separately

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 50 +++++++++++--------
 .../adaptive/SkewJoinAwareCost.scala          | 20 +++++---
 .../adaptive/AdaptiveQueryExecSuite.scala     |  2 +-
 3 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 017ee417f75bc..63feccb0fe40f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -153,11 +153,7 @@ case class AdaptiveSparkPlanExec(
   }
 
   @transient private val costEvaluator =
-    if (conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN)) {
-      SkewJoinAwareCostEvaluator
-    } else {
-      SimpleCostEvaluator
-    }
+    SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN))
 
   @transient val initialPlan = context.session.withActive {
     applyPhysicalRules(
@@ -279,17 +275,26 @@ case class AdaptiveSparkPlanExec(
         // plans are updated, we can clear the query stage list because at this point the two plans
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
-        val (newPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
+        val (reOptimizePhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
+        val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizePhysicalPlan)
         val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
-        val newCost = costEvaluator.evaluateCost(newPhysicalPlan)
-        if (newCost < origCost ||
-            (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) {
+        val newCost = costEvaluator.evaluateCost(reOptimizePhysicalPlan)
+        val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle)
+        def updateCurrentPlan(newPhysicalPlan: SparkPlan): Unit = {
           logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan")
           cleanUpTempTags(newPhysicalPlan)
           currentPhysicalPlan = newPhysicalPlan
           currentLogicalPlan = newLogicalPlan
           stagesToReplace = Seq.empty[QueryStageExec]
         }
+
+        if (extraShuffleCost < newCost ||
+          (extraShuffleCost == newCost && planWithExtraShuffle != reOptimizePhysicalPlan)) {
+          updateCurrentPlan(planWithExtraShuffle)
+        } else if (newCost < origCost ||
+          (newCost == origCost && currentPhysicalPlan != reOptimizePhysicalPlan)) {
+          updateCurrentPlan(reOptimizePhysicalPlan)
+        }
         // Now that some stages have finished, we can try creating new stages.
         result = createQueryStages(currentPhysicalPlan)
       }
@@ -651,16 +656,6 @@ case class AdaptiveSparkPlanExec(
       preprocessingRules ++ queryStagePreparationRules,
       Some((planChangeLogger, "AQE Replanning")))
 
-    val preparationWithExtraShuffleRules = if (isFinalStage(newPlan)) {
-      finalStagePreparationWithExtraShuffleRules
-    } else {
-      queryStagePreparationWithExtraShuffleRules
-    }
-    val newPlanWithExtraShuffle = applyPhysicalRules(
-      newPlan,
-      preparationWithExtraShuffleRules,
-      Some((planChangeLogger, "AQE Replanning")))
-
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
     // add the `BroadcastExchangeExec` node manually in the DPP subquery,
     // not through `EnsureRequirements` rule. Therefore, when the DPP subquery is complicated
@@ -670,14 +665,25 @@ case class AdaptiveSparkPlanExec(
     // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule.
     val finalPlan = currentPhysicalPlan match {
       case b: BroadcastExchangeLike
-        if (!newPlanWithExtraShuffle.isInstanceOf[BroadcastExchangeLike]) =>
-        b.withNewChildren(Seq(newPlanWithExtraShuffle))
-      case _ => newPlanWithExtraShuffle
+        if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan))
+      case _ => newPlan
     }
 
     (finalPlan, optimized)
   }
 
+  private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = {
+    val preparationWithExtraShuffleRules = if (isFinalStage(sparkPlan)) {
+      finalStagePreparationWithExtraShuffleRules
+    } else {
+      queryStagePreparationWithExtraShuffleRules
+    }
+    applyPhysicalRules(
+      sparkPlan,
+      preparationWithExtraShuffleRules,
+      Some((planChangeLogger, "AQE Replanning")))
+  }
+
   /**
    * Recursively set `TEMP_LOGICAL_PLAN_TAG` for the current `plan` node.
    */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
index a2e101216a548..c2e9642171588 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
@@ -25,7 +25,9 @@ import org.apache.spark.sql.execution.joins.ShuffledJoin
 /**
  * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number
  */
-case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost {
+case class SkewJoinAwareCost(
+    numShuffles: Int,
+    numSkewJoins: Int) extends Cost {
   override def compare(that: Cost): Int = that match {
     case other: SkewJoinAwareCost =>
       if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) {
@@ -36,6 +38,7 @@ case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost {
       } else {
         0
       }
+
     case _ =>
       throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString)
   }
@@ -45,14 +48,19 @@ case class SkewJoinAwareCost(numShuffles: Int, numSkewJoins: Int) extends Cost {
  * A skew join aware implementation of [[CostEvaluator]], which counts the number of
  * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan.
  */
-object SkewJoinAwareCostEvaluator extends CostEvaluator {
+case class SkewJoinAwareCostEvaluator(forceOptimizeSkewJoin: Boolean) extends CostEvaluator {
   override def evaluateCost(plan: SparkPlan): Cost = {
     val shuffleNumber = plan.collect {
       case s: ShuffleExchangeLike => s
     }.size
-    val skewJoinNumber = plan.collect {
-      case j: ShuffledJoin if j.isSkewJoin => j
-    }.size
-    SkewJoinAwareCost(shuffleNumber, skewJoinNumber)
+
+    if (forceOptimizeSkewJoin) {
+      val skewJoinNumber = plan.collect {
+        case j: ShuffledJoin if j.isSkewJoin => j
+      }.size
+      SkewJoinAwareCost(shuffleNumber, skewJoinNumber)
+    } else {
+      SimpleCost(shuffleNumber)
+    }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index d3726ed31f76e..6977007943d4c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1786,7 +1786,6 @@ class AdaptiveQueryExecSuite
     }
   }
 
-<<<<<<< HEAD
   test("SPARK-35650: Coalesce number of partitions by AEQ") {
     withSQLConf(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1") {
       Seq("REPARTITION", "REBALANCE(key)")
@@ -1890,6 +1889,7 @@ class AdaptiveQueryExecSuite
   test("SPARK-33832: Support optimize skew join even if introduce extra shuffle") {
     withSQLConf(
       SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+      SQLConf.ADAPTIVE_OPTIMIZE_SKEWS_IN_REBALANCE_PARTITIONS_ENABLED.key -> "false",
       SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
       SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100",
       SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",

From 8bc22ad61f2a76da6a3430f5eebe9646cf5fb841 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 9 Jul 2021 17:06:17 +0800
Subject: [PATCH 06/37] nit: rename config to forceOptimizeSkewedJoin

---
 .../scala/org/apache/spark/sql/internal/SQLConf.scala | 11 ++++++-----
 .../execution/adaptive/AdaptiveSparkPlanExec.scala    |  3 ++-
 .../execution/adaptive/AdaptiveQueryExecSuite.scala   |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 35f7f116a51e5..6e3f4909d8b6c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -678,11 +678,12 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
-  val ADAPTIVE_FORCE_ENABLE_SKEW_JOIN = buildConf("spark.sql.adaptive.forceEnableSkewJoin")
-    .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.")
-    .version("3.2.0")
-    .booleanConf
-    .createWithDefault(false)
+  val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN =
+    buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin")
+      .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.")
+      .version("3.2.0")
+      .booleanConf
+      .createWithDefault(false)
 
   val ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS =
     buildConf("spark.sql.adaptive.customCostEvaluatorClass")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index c21e35ca4674c..b975456b8465e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -155,7 +155,8 @@ case class AdaptiveSparkPlanExec(
   @transient private val costEvaluator =
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
-      case _ => SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN))
+      case _ =>
+        SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
     }
 
   @transient val initialPlan = context.session.withActive {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index e755c848100d7..5d4cd268816e9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1894,7 +1894,7 @@ class AdaptiveQueryExecSuite
       SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",
       SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1",
       SQLConf.SHUFFLE_PARTITIONS.key -> "10",
-      SQLConf.ADAPTIVE_FORCE_ENABLE_SKEW_JOIN.key -> "true") {
+      SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN.key -> "true") {
       withTempView("skewData1", "skewData2") {
         spark
           .range(0, 1000, 1, 10)

From 7734d3eec1078a56f600ce7b6e718824c09a0e3b Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 9 Jul 2021 17:38:23 +0800
Subject: [PATCH 07/37] nit

---
 .../adaptive/AdaptiveSparkPlanExec.scala       | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index b975456b8465e..4324eefbe8b64 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -281,25 +281,25 @@ case class AdaptiveSparkPlanExec(
         // plans are updated, we can clear the query stage list because at this point the two plans
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
-        val (reOptimizePhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
-        val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizePhysicalPlan)
+        val (reOptimizationPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
+        val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizationPhysicalPlan)
         val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
-        val newCost = costEvaluator.evaluateCost(reOptimizePhysicalPlan)
+        val reOptimizationCost = costEvaluator.evaluateCost(reOptimizationPhysicalPlan)
         val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle)
         def updateCurrentPlan(newPhysicalPlan: SparkPlan): Unit = {
-          logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan")
+          logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan")
           cleanUpTempTags(newPhysicalPlan)
           currentPhysicalPlan = newPhysicalPlan
           currentLogicalPlan = newLogicalPlan
           stagesToReplace = Seq.empty[QueryStageExec]
         }
 
-        if (extraShuffleCost < newCost ||
-          (extraShuffleCost == newCost && planWithExtraShuffle != reOptimizePhysicalPlan)) {
+        if (extraShuffleCost < reOptimizationCost || (extraShuffleCost == reOptimizationCost &&
+          reOptimizationPhysicalPlan != planWithExtraShuffle)) {
           updateCurrentPlan(planWithExtraShuffle)
-        } else if (newCost < origCost ||
-          (newCost == origCost && currentPhysicalPlan != reOptimizePhysicalPlan)) {
-          updateCurrentPlan(reOptimizePhysicalPlan)
+        } else if (reOptimizationCost < origCost ||
+          (reOptimizationCost == origCost && currentPhysicalPlan != reOptimizationPhysicalPlan)) {
+          updateCurrentPlan(reOptimizationPhysicalPlan)
         }
         // Now that some stages have finished, we can try creating new stages.
         result = createQueryStages(currentPhysicalPlan)

From 3dc61a3f6ebb784aac7e739bed34f0028f77c052 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 3 Aug 2021 19:38:46 +0800
Subject: [PATCH 08/37] force optimize skewed join

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 62 ++-----------------
 .../adaptive/SkewJoinAwareCost.scala          |  4 +-
 2 files changed, 6 insertions(+), 60 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index da003b24a9a89..c719e6c4a6524 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -116,9 +116,9 @@ case class AdaptiveSparkPlanExec(
     // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of
     // skew join and extra shuffle nodes.
     OptimizeSkewedJoin,
-    // Add the EnsureRequirements rule here since OptimizeSkewedJoin may change the
-    // output partitioning
-    EnsureRequirements()
+    // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
+    // ensure the output partitioning of OptimizeSkewedJoin is always expected.
+    EnsureRequirements(false)
   )
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
@@ -140,36 +140,6 @@ case class AdaptiveSparkPlanExec(
     CollapseCodegenStages()
   )
 
-  // OptimizeSkewedJoin has moved into this rules, so we should follow the finalStageOptimizerRules
-  // for the final stage.
-  private def finalStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = {
-    val origins = inputPlan.collect {
-      case s: ShuffleExchangeLike => s.shuffleOrigin
-    }
-    queryStagePreparationWithExtraShuffleRules.filter {
-      case c: AQEShuffleReadRule =>
-        origins.forall(c.supportedShuffleOrigins.contains)
-      case _ => true
-    }
-  }
-
-  // The partitioning of the query output depends on the shuffle(s) in the final stage. If the
-  // original plan contains a repartition operator, we need to preserve the specified partitioning,
-  // whether or not the repartition-introduced shuffle is optimized out because of an underlying
-  // shuffle of the same partitioning. Thus, we need to exclude some `CustomShuffleReaderRule`s
-  // from the final stage, depending on the presence and properties of repartition operators.
-  private def finalStageOptimizerRules: Seq[Rule[SparkPlan]] = {
-    val origins = inputPlan.collect {
-      case s: ShuffleExchangeLike => s.shuffleOrigin
-    }
-    val allRules = queryStageOptimizerRules ++ postStageCreationRules
-    allRules.filter {
-      case c: CustomShuffleReaderRule =>
-        origins.forall(c.supportedShuffleOrigins.contains)
-      case _ => true
-    }
-  } ++ context.session.sessionState.postStageCreationRules
-
   private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
     val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) =>
       val applied = rule.apply(latestPlan)
@@ -698,25 +668,6 @@ case class AdaptiveSparkPlanExec(
     logicalPlan
   }
 
-  private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
-    sparkPlan match {
-      // avoid top level node is Exchange
-      case _: Exchange => false
-      case plan =>
-        // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage
-        // and all query stages are materialized.
-        plan.find {
-          case p if p.children.exists(
-            child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) =>
-            p match {
-              case stage: QueryStageExec if stage.isMaterialized => false
-              case _ => true
-            }
-          case _ => false
-        }.isEmpty
-    }
-  }
-
   /**
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */
@@ -746,14 +697,9 @@ case class AdaptiveSparkPlanExec(
   }
 
   private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = {
-    val preparationWithExtraShuffleRules = if (isFinalStage(sparkPlan)) {
-      finalStagePreparationWithExtraShuffleRules
-    } else {
-      queryStagePreparationWithExtraShuffleRules
-    }
     applyPhysicalRules(
       sparkPlan,
-      preparationWithExtraShuffleRules,
+      queryStagePreparationWithExtraShuffleRules,
       Some((planChangeLogger, "AQE Replanning")))
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
index c2e9642171588..892b8a9749677 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
@@ -48,13 +48,13 @@ case class SkewJoinAwareCost(
  * A skew join aware implementation of [[CostEvaluator]], which counts the number of
  * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan.
  */
-case class SkewJoinAwareCostEvaluator(forceOptimizeSkewJoin: Boolean) extends CostEvaluator {
+case class SkewJoinAwareCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator {
   override def evaluateCost(plan: SparkPlan): Cost = {
     val shuffleNumber = plan.collect {
       case s: ShuffleExchangeLike => s
     }.size
 
-    if (forceOptimizeSkewJoin) {
+    if (forceOptimizeSkewedJoin) {
       val skewJoinNumber = plan.collect {
         case j: ShuffledJoin if j.isSkewJoin => j
       }.size

From 30b7de03dc20ab8f2d9aaa07ed37a18d6081f672 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 3 Aug 2021 19:41:53 +0800
Subject: [PATCH 09/37] style

---
 .../apache/spark/sql/internal/SQLConf.scala   |  2 +-
 .../adaptive/AdaptiveSparkPlanExec.scala      |  2 +-
 .../adaptive/OptimizeSkewedJoin.scala         | 20 +++++++++----------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 71d5928c24a74..70097ab1518d9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -680,7 +680,7 @@ object SQLConf {
 
   val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN =
     buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin")
-      .doc("When true, force enable OptimizeSkewJoin even if it introduces extra shuffle.")
+      .doc("When true, force enable OptimizeSkewedJoin even if it introduces extra shuffle.")
       .version("3.2.0")
       .booleanConf
       .createWithDefault(false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index c719e6c4a6524..cc63000c88ce4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -138,7 +138,7 @@ case class AdaptiveSparkPlanExec(
   @transient private val postStageCreationRules = Seq(
     ApplyColumnarRulesAndInsertTransitions(context.session.sessionState.columnarRules),
     CollapseCodegenStages()
-  )
+  ) ++ context.session.sessionState.postStageCreationRules
 
   private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
     val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index 2c423c8a4189a..dcdc67360f98c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -114,9 +114,9 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule {
    *    3 tasks separately.
    */
   private def tryOptimizeJoinChildren(
-                                       left: ShuffleQueryStageExec,
-                                       right: ShuffleQueryStageExec,
-                                       joinType: JoinType): Option[(SparkPlan, SparkPlan)] = {
+      left: ShuffleQueryStageExec,
+      right: ShuffleQueryStageExec,
+      joinType: JoinType): Option[(SparkPlan, SparkPlan)] = {
     val canSplitLeft = canSplitLeftSide(joinType)
     val canSplitRight = canSplitRightSide(joinType)
     if (!canSplitLeft && !canSplitRight) return None
@@ -202,9 +202,9 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule {
   }
 
   def optimizeSkewJoin(plan: SparkPlan): SparkPlan = plan.transformUp {
-    case smj@SortMergeJoinExec(_, _, joinType, _,
-    s1@SortExec(_, _, ShuffleStage(left: ShuffleQueryStageExec), _),
-    s2@SortExec(_, _, ShuffleStage(right: ShuffleQueryStageExec), _), false) =>
+    case smj @ SortMergeJoinExec(_, _, joinType, _,
+        s1 @ SortExec(_, _, ShuffleStage(left: ShuffleQueryStageExec), _),
+        s2 @ SortExec(_, _, ShuffleStage(right: ShuffleQueryStageExec), _), false) =>
       val newChildren = tryOptimizeJoinChildren(left, right, joinType)
       if (newChildren.isDefined) {
         val (newLeft, newRight) = newChildren.get
@@ -214,9 +214,9 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule {
         smj
       }
 
-    case shj@ShuffledHashJoinExec(_, _, joinType, _, _,
-    ShuffleStage(left: ShuffleQueryStageExec),
-    ShuffleStage(right: ShuffleQueryStageExec), false) =>
+    case shj @ ShuffledHashJoinExec(_, _, joinType, _, _,
+        ShuffleStage(left: ShuffleQueryStageExec),
+        ShuffleStage(right: ShuffleQueryStageExec), false) =>
       val newChildren = tryOptimizeJoinChildren(left, right, joinType)
       if (newChildren.isDefined) {
         val (newLeft, newRight) = newChildren.get
@@ -256,7 +256,7 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule {
     }
   }
 
-  private object ShuffleStage {
+  object ShuffleStage {
     def unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match {
       case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined &&
         isSupported(s.shuffle) =>

From 6caa4a301d3836d34bfc99f02fcb7decb2ce82aa Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 3 Aug 2021 20:13:45 +0800
Subject: [PATCH 10/37] name

---
 .../spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index a414e6a44129f..fe4a2f3e82408 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1925,14 +1925,14 @@ class AdaptiveQueryExecSuite
           "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1")
         val smj = findTopLevelSortMergeJoin(adaptive)
         assert(smj.size == 1 && smj.forall(_.isSkewJoin))
-        checkNumLocalShuffleReaders(adaptive, 3)
+        checkNumLocalShuffleReads(adaptive, 3)
 
         val (_, adaptive2) = runAdaptiveAndVerifyResult(
           "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2")
         val smj2 = findTopLevelSortMergeJoin(adaptive2)
         assert(smj2.size == 1 && smj2.forall(_.isSkewJoin))
         // top level shuffle reader is local
-        checkNumLocalShuffleReaders(adaptive2, 2)
+        checkNumLocalShuffleReads(adaptive2, 2)
       }
     }
   }

From cd1a37992e3fd80d3ec6ee808a41dd2fa6233350 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 13 Aug 2021 11:55:18 +0800
Subject: [PATCH 11/37] final stage

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 38 +++++++++++++++----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index cc63000c88ce4..b2d87f1d5c5de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._
 import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan
 import org.apache.spark.sql.execution.exchange._
-import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric}
+import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SQLPlanMetric, SparkListenerSQLAdaptiveSQLMetricUpdates}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.ThreadUtils
@@ -118,7 +118,7 @@ case class AdaptiveSparkPlanExec(
     OptimizeSkewedJoin,
     // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
     // ensure the output partitioning of OptimizeSkewedJoin is always expected.
-    EnsureRequirements(false)
+    EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)
   )
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
@@ -140,8 +140,9 @@ case class AdaptiveSparkPlanExec(
     CollapseCodegenStages()
   ) ++ context.session.sessionState.postStageCreationRules
 
-  private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
-    val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) =>
+  private def optimizeStage(
+      rules: Seq[Rule[SparkPlan]], plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
+    val optimized = rules.foldLeft(plan) { case (latestPlan, rule) =>
       val applied = rule.apply(latestPlan)
       val result = rule match {
         case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) =>
@@ -169,6 +170,10 @@ case class AdaptiveSparkPlanExec(
     optimized
   }
 
+  private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
+    optimizeStage(queryStageOptimizerRules, plan, isFinalStage)
+  }
+
   @transient private val costEvaluator =
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
@@ -696,11 +701,30 @@ case class AdaptiveSparkPlanExec(
     (finalPlan, optimized)
   }
 
+  private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
+    sparkPlan match {
+      // avoid top level node is Exchange
+      case _: Exchange => false
+      case plan =>
+        // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage
+        // and all query stages are materialized.
+        plan.find {
+          case p if p.children.exists(
+            child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) =>
+            p match {
+              case stage: QueryStageExec if stage.isMaterialized => false
+              case _ => true
+            }
+          case _ => false
+        }.isEmpty
+    }
+  }
+
   private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = {
-    applyPhysicalRules(
-      sparkPlan,
+    optimizeStage(
       queryStagePreparationWithExtraShuffleRules,
-      Some((planChangeLogger, "AQE Replanning")))
+      sparkPlan,
+      isFinalStage(sparkPlan))
   }
 
   /**

From 2b3bfe6d04456b0eb7df083964067dec5539ac38 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 13 Aug 2021 11:56:12 +0800
Subject: [PATCH 12/37] style

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index b2d87f1d5c5de..d888a20973ebc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._
 import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan
 import org.apache.spark.sql.execution.exchange._
-import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SQLPlanMetric, SparkListenerSQLAdaptiveSQLMetricUpdates}
+import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.ThreadUtils

From 6725f975ca9f15eb8f233381fcfa9fc88cdef760 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 13 Aug 2021 14:35:01 +0800
Subject: [PATCH 13/37] checkDistribution

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 59 +++++++++++--------
 .../adaptive/AdaptiveQueryExecSuite.scala     |  5 ++
 2 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 6b41529c8a394..3ccaa8749cabd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -113,7 +113,7 @@ case class AdaptiveSparkPlanExec(
 
   // This list rules are applied between queryStagePreparationRules and estimate physical plan cost
   // so that we can support introduce extra shuffle
-  @transient private val queryStagePreparationWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq(
+  @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq(
     // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of
     // skew join and extra shuffle nodes.
     OptimizeSkewedJoin,
@@ -147,27 +147,14 @@ case class AdaptiveSparkPlanExec(
     collapseCodegenStagesRule
   )
 
-  private def optimizeStage(
-      rules: Seq[Rule[SparkPlan]], plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
-    val optimized = rules.foldLeft(plan) { case (latestPlan, rule) =>
+  private def optimizeQueryStage(
+      plan: SparkPlan,
+      isFinalStage: Boolean): SparkPlan = context.qe.withCteMap {
+    val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) =>
       val applied = rule.apply(latestPlan)
       val result = rule match {
         case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) =>
-          val distribution = if (isFinalStage) {
-            // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize
-            // out the user-specified repartition, thus we don't have a distribution requirement
-            // for the final plan.
-            requiredDistribution.getOrElse(UnspecifiedDistribution)
-          } else {
-            UnspecifiedDistribution
-          }
-          if (ValidateRequirements.validate(applied, distribution)) {
-            applied
-          } else {
-            logDebug(s"Rule ${rule.ruleName} is not applied as it breaks the " +
-              "distribution requirement of the query plan.")
-            latestPlan
-          }
+          checkDistribution(applied, latestPlan, isFinalStage, rule.ruleName)
         case _ => applied
       }
       planChangeLogger.logRule(rule.ruleName, latestPlan, result)
@@ -177,8 +164,26 @@ case class AdaptiveSparkPlanExec(
     optimized
   }
 
-  private def optimizeQueryStage(plan: SparkPlan, isFinalStage: Boolean): SparkPlan = {
-    optimizeStage(queryStageOptimizerRules, plan, isFinalStage)
+  private def checkDistribution(
+      newPlan: SparkPlan,
+      originPlan: SparkPlan,
+      isFinalStage: Boolean,
+      ruleName: String): SparkPlan = {
+    val distribution = if (isFinalStage) {
+      // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize
+      // out the user-specified repartition, thus we don't have a distribution requirement
+      // for the final plan.
+      requiredDistribution.getOrElse(UnspecifiedDistribution)
+    } else {
+      UnspecifiedDistribution
+    }
+    if (ValidateRequirements.validate(newPlan, distribution)) {
+      newPlan
+    } else {
+      logDebug(s"Rule $ruleName is not applied as it breaks the " +
+        "distribution requirement of the query plan.")
+      originPlan
+    }
   }
 
   @transient private val costEvaluator =
@@ -311,7 +316,7 @@ case class AdaptiveSparkPlanExec(
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
         val (reOptimizationPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
-        val planWithExtraShuffle = rePlanWithExtraShuffle(reOptimizationPhysicalPlan)
+        val planWithExtraShuffle = optimizeSkewedJoin(reOptimizationPhysicalPlan)
         val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
         val reOptimizationCost = costEvaluator.evaluateCost(reOptimizationPhysicalPlan)
         val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle)
@@ -706,11 +711,13 @@ case class AdaptiveSparkPlanExec(
     }
   }
 
-  private def rePlanWithExtraShuffle(sparkPlan: SparkPlan): SparkPlan = {
-    optimizeStage(
-      queryStagePreparationWithExtraShuffleRules,
+  private def optimizeSkewedJoin(sparkPlan: SparkPlan): SparkPlan = {
+    val optimized = applyPhysicalRules(
       sparkPlan,
-      isFinalStage(sparkPlan))
+      optimizeSkewedJoinWithExtraShuffleRules,
+      Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle"))
+    )
+    checkDistribution(optimized, sparkPlan, isFinalStage(optimized), OptimizeSkewedJoin.ruleName)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index 8e838542ddd74..438529aad0e6f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1940,6 +1940,11 @@ class AdaptiveQueryExecSuite
         assert(smj2.size == 1 && smj2.forall(_.isSkewJoin))
         // top level shuffle reader is local
         checkNumLocalShuffleReads(adaptive2, 2)
+
+        val (_, adaptive3) = runAdaptiveAndVerifyResult(
+          "SELECT /*+ repartition(key1) */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2")
+        val smj3 = findTopLevelSortMergeJoin(adaptive3)
+        assert(smj3.size == 1 && !smj3.exists(_.isSkewJoin))
       }
     }
   }

From 7a0448b0a80b52bf226f219bea515a5d0f7e96c4 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 19 Aug 2021 19:17:00 +0800
Subject: [PATCH 14/37] SimpleCostEvaluator

---
 .../apache/spark/sql/internal/SQLConf.scala   |  2 +-
 .../adaptive/AdaptiveSparkPlanExec.scala      |  2 +-
 .../adaptive/SkewJoinAwareCost.scala          | 66 -------------------
 .../execution/adaptive/simpleCosting.scala    | 40 +++++++++--
 4 files changed, 37 insertions(+), 73 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 9c84add520249..0020faed79ceb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -680,7 +680,7 @@ object SQLConf {
   val ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN =
     buildConf("spark.sql.adaptive.forceOptimizeSkewedJoin")
       .doc("When true, force enable OptimizeSkewedJoin even if it introduces extra shuffle.")
-      .version("3.2.0")
+      .version("3.3.0")
       .booleanConf
       .createWithDefault(false)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 3ccaa8749cabd..ffac595d685ce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -190,7 +190,7 @@ case class AdaptiveSparkPlanExec(
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
       case _ =>
-        SkewJoinAwareCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
+        SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
     }
 
   @transient val initialPlan = context.session.withActive {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
deleted file mode 100644
index 892b8a9749677..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewJoinAwareCost.scala
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution.adaptive
-
-import org.apache.spark.sql.errors.QueryExecutionErrors
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike
-import org.apache.spark.sql.execution.joins.ShuffledJoin
-
-/**
- * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number
- */
-case class SkewJoinAwareCost(
-    numShuffles: Int,
-    numSkewJoins: Int) extends Cost {
-  override def compare(that: Cost): Int = that match {
-    case other: SkewJoinAwareCost =>
-      if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) {
-        // If more skew joins are optimized or less shuffle nodes, it means the cost is lower
-        -1
-      } else if (numShuffles > other.numShuffles) {
-        1
-      } else {
-        0
-      }
-
-    case _ =>
-      throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString)
-  }
-}
-
-/**
- * A skew join aware implementation of [[CostEvaluator]], which counts the number of
- * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan.
- */
-case class SkewJoinAwareCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator {
-  override def evaluateCost(plan: SparkPlan): Cost = {
-    val shuffleNumber = plan.collect {
-      case s: ShuffleExchangeLike => s
-    }.size
-
-    if (forceOptimizeSkewedJoin) {
-      val skewJoinNumber = plan.collect {
-        case j: ShuffledJoin if j.isSkewJoin => j
-      }.size
-      SkewJoinAwareCost(shuffleNumber, skewJoinNumber)
-    } else {
-      SimpleCost(shuffleNumber)
-    }
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
index 7f026835dcf48..8b26d84f43ea8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.adaptive
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike
+import org.apache.spark.sql.execution.joins.ShuffledJoin
 
 /**
  * A simple implementation of [[Cost]], which takes a number of [[Long]] as the cost value.
@@ -35,15 +36,44 @@ case class SimpleCost(value: Long) extends Cost {
 }
 
 /**
- * A simple implementation of [[CostEvaluator]], which counts the number of
- * [[ShuffleExchangeLike]] nodes in the plan.
+ * A skew join aware implementation of [[Cost]], which considers shuffle and skew join numbers
  */
-object SimpleCostEvaluator extends CostEvaluator {
+case class SkewJoinAwareCost(
+    numShuffles: Int,
+    numSkewJoins: Int) extends Cost {
+  override def compare(that: Cost): Int = that match {
+    case other: SkewJoinAwareCost =>
+      if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) {
+        // If more skew joins are optimized or less shuffle nodes, it means the cost is lower
+        -1
+      } else if (numShuffles > other.numShuffles) {
+        1
+      } else {
+        0
+      }
 
+    case _ =>
+      throw QueryExecutionErrors.cannotCompareCostWithTargetCostError(that.toString)
+  }
+}
+
+/**
+ * A skew join aware implementation of [[CostEvaluator]], which counts the number of
+ * [[ShuffleExchangeLike]] nodes and skew join nodes in the plan.
+ */
+case class SimpleCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator {
   override def evaluateCost(plan: SparkPlan): Cost = {
-    val cost = plan.collect {
+    val shuffleNumber = plan.collect {
       case s: ShuffleExchangeLike => s
     }.size
-    SimpleCost(cost)
+
+    if (forceOptimizeSkewedJoin) {
+      val skewJoinNumber = plan.collect {
+        case j: ShuffledJoin if j.isSkewJoin => j
+      }.size
+      SkewJoinAwareCost(shuffleNumber, skewJoinNumber)
+    } else {
+      SimpleCost(shuffleNumber)
+    }
   }
 }

From 60b7b9da8f268f15706c30462976ffc389c95c45 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 19 Aug 2021 20:44:11 +0800
Subject: [PATCH 15/37] address comment

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 97 +++++++++++--------
 1 file changed, 57 insertions(+), 40 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index ffac595d685ce..d77e90074b4f7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -80,6 +80,9 @@ case class AdaptiveSparkPlanExec(
     case _ => logDebug(_)
   }
 
+  @transient private val forceOptimizeSkewedJoin =
+    conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)
+
   @transient private val planChangeLogger = new PlanChangeLogger[SparkPlan]()
 
   // The logical plan optimizer for re-optimizing the current logical plan.
@@ -113,14 +116,16 @@ case class AdaptiveSparkPlanExec(
 
   // This list rules are applied between queryStagePreparationRules and estimate physical plan cost
   // so that we can support introduce extra shuffle
-  @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = Seq(
-    // Apply OptimizeSkewedJoin rule at preparation side so that we can compare the cost of
-    // skew join and extra shuffle nodes.
-    OptimizeSkewedJoin,
-    // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
-    // ensure the output partitioning of OptimizeSkewedJoin is always expected.
-    EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined)
-  )
+  @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = {
+    val ensureExtraShuffleRule = if (forceOptimizeSkewedJoin) {
+      // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
+      // ensure the output partitioning of OptimizeSkewedJoin is always expected.
+      Seq(EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined))
+    } else {
+      Nil
+    }
+    Seq(OptimizeSkewedJoin) ++ ensureExtraShuffleRule
+  }
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
   // optimizations should be stage-independent.
@@ -153,7 +158,7 @@ case class AdaptiveSparkPlanExec(
     val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) =>
       val applied = rule.apply(latestPlan)
       val result = rule match {
-        case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) =>
+        case _: AQEShuffleReadRule =>
           checkDistribution(applied, latestPlan, isFinalStage, rule.ruleName)
         case _ => applied
       }
@@ -169,6 +174,7 @@ case class AdaptiveSparkPlanExec(
       originPlan: SparkPlan,
       isFinalStage: Boolean,
       ruleName: String): SparkPlan = {
+    if (newPlan.fastEquals(originPlan)) return originPlan
     val distribution = if (isFinalStage) {
       // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize
       // out the user-specified repartition, thus we don't have a distribution requirement
@@ -189,8 +195,7 @@ case class AdaptiveSparkPlanExec(
   @transient private val costEvaluator =
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
-      case _ =>
-        SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
+      case _ => SimpleCostEvaluator(forceOptimizeSkewedJoin)
     }
 
   @transient val initialPlan = context.session.withActive {
@@ -315,26 +320,25 @@ case class AdaptiveSparkPlanExec(
         // plans are updated, we can clear the query stage list because at this point the two plans
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
-        val (reOptimizationPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
-        val planWithExtraShuffle = optimizeSkewedJoin(reOptimizationPhysicalPlan)
-        val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
-        val reOptimizationCost = costEvaluator.evaluateCost(reOptimizationPhysicalPlan)
-        val extraShuffleCost = costEvaluator.evaluateCost(planWithExtraShuffle)
-        def updateCurrentPlan(newPhysicalPlan: SparkPlan): Unit = {
+        val (newPhysicalPlans, newLogicalPlan) = reOptimize(logicalPlan)
+        val newPhysicalPlan =
+          (Seq(currentPhysicalPlan) ++ newPhysicalPlans)
+            .map(plan => (plan, costEvaluator.evaluateCost(plan)))
+            .reduce { (last, current) =>
+              if (current._2 < last._2 || (current._2 == last._2  && current._1 != last._1)) {
+                current
+              } else {
+                last
+              }
+            }._1
+
+        if (!newPhysicalPlan.fastEquals(currentPhysicalPlan)) {
           logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan")
           cleanUpTempTags(newPhysicalPlan)
           currentPhysicalPlan = newPhysicalPlan
           currentLogicalPlan = newLogicalPlan
           stagesToReplace = Seq.empty[QueryStageExec]
         }
-
-        if (extraShuffleCost < reOptimizationCost || (extraShuffleCost == reOptimizationCost &&
-          reOptimizationPhysicalPlan != planWithExtraShuffle)) {
-          updateCurrentPlan(planWithExtraShuffle)
-        } else if (reOptimizationCost < origCost ||
-          (reOptimizationCost == origCost && currentPhysicalPlan != reOptimizationPhysicalPlan)) {
-          updateCurrentPlan(reOptimizationPhysicalPlan)
-        }
         // Now that some stages have finished, we can try creating new stages.
         result = createQueryStages(currentPhysicalPlan)
       }
@@ -667,11 +671,11 @@ case class AdaptiveSparkPlanExec(
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */
   private def reOptimize(
-      logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = context.qe.withCteMap {
+      logicalPlan: LogicalPlan): (Seq[SparkPlan], LogicalPlan) = context.qe.withCteMap {
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
-    val newPlan = applyPhysicalRules(
+    val optimizedPhysicalPlan = applyPhysicalRules(
       sparkPlan,
       preprocessingRules ++ queryStagePreparationRules,
       Some((planChangeLogger, "AQE Replanning")))
@@ -683,13 +687,35 @@ case class AdaptiveSparkPlanExec(
     // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery.
     // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan
     // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule.
-    val finalPlan = currentPhysicalPlan match {
+    def updateBroadcastExchange(plan: SparkPlan): SparkPlan = currentPhysicalPlan match {
       case b: BroadcastExchangeLike
-        if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan))
-      case _ => newPlan
+        if (!plan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(plan))
+      case _ => plan
     }
 
-    (finalPlan, optimized)
+    val optimizedWithSkewedJoin = applyPhysicalRules(
+      optimizedPhysicalPlan,
+      optimizeSkewedJoinWithExtraShuffleRules,
+      Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle"))
+    )
+    val validatedWithSkewedJoin =
+      checkDistribution(
+        optimizedWithSkewedJoin,
+        optimizedPhysicalPlan,
+        isFinalStage(optimizedWithSkewedJoin),
+        OptimizeSkewedJoin.ruleName)
+
+    // here are three reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan:
+    // 1. no skewed join optimized
+    // 2. optimize skewed join introduce extra shuffle and force optimize is disabled
+    // 3. optimize skewed join change final stage output partitioning
+    val newPhysicalPlans = if (validatedWithSkewedJoin.fastEquals(optimizedPhysicalPlan)) {
+      updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
+    } else {
+      updateBroadcastExchange(optimizedPhysicalPlan) ::
+        updateBroadcastExchange(validatedWithSkewedJoin) :: Nil
+    }
+    (newPhysicalPlans, optimized)
   }
 
   private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
@@ -711,15 +737,6 @@ case class AdaptiveSparkPlanExec(
     }
   }
 
-  private def optimizeSkewedJoin(sparkPlan: SparkPlan): SparkPlan = {
-    val optimized = applyPhysicalRules(
-      sparkPlan,
-      optimizeSkewedJoinWithExtraShuffleRules,
-      Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle"))
-    )
-    checkDistribution(optimized, sparkPlan, isFinalStage(optimized), OptimizeSkewedJoin.ruleName)
-  }
-
   /**
    * Recursively set `TEMP_LOGICAL_PLAN_TAG` for the current `plan` node.
    */

From fbf9727f9f1add7447005e1c4bc41697cc512e1b Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 19 Aug 2021 21:52:58 +0800
Subject: [PATCH 16/37] cost

---
 .../spark/sql/execution/adaptive/simpleCosting.scala      | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
index 8b26d84f43ea8..621fb01832ccd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
@@ -43,8 +43,12 @@ case class SkewJoinAwareCost(
     numSkewJoins: Int) extends Cost {
   override def compare(that: Cost): Int = that match {
     case other: SkewJoinAwareCost =>
-      if (numSkewJoins > other.numSkewJoins || numShuffles < other.numShuffles) {
-        // If more skew joins are optimized or less shuffle nodes, it means the cost is lower
+      // More optimized skew joins means a lower cost; ties are broken by fewer shuffle nodes
+      if (numSkewJoins > other.numSkewJoins) {
+        -1
+      } else if (numSkewJoins < other.numSkewJoins) {
+        1
+      } else if (numShuffles < other.numShuffles) {
         -1
       } else if (numShuffles > other.numShuffles) {
         1

From b54e9c232a43ce05e320d5459b16b770261ee38b Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 20 Aug 2021 12:26:02 +0800
Subject: [PATCH 17/37] plan twice

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 114 +++++++++---------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index d77e90074b4f7..2e6585bd5b818 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -103,28 +103,27 @@ case class AdaptiveSparkPlanExec(
   // A list of physical plan rules to be applied before creation of query stages. The physical
   // plan should reach a final status of query stages (i.e., no more addition or removal of
   // Exchange nodes) after running these rules.
-  @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq(
-    RemoveRedundantProjects,
-    // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for
-    // the final plan, but we do need to respect the user-specified repartition. Here we ask
-    // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
-    // around this case.
-    EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined),
-    RemoveRedundantSorts,
-    DisableUnnecessaryBucketedScan
-  ) ++ context.session.sessionState.queryStagePrepRules
-
-  // This list rules are applied between queryStagePreparationRules and estimate physical plan cost
-  // so that we can support introduce extra shuffle
-  @transient private val optimizeSkewedJoinWithExtraShuffleRules: Seq[Rule[SparkPlan]] = {
-    val ensureExtraShuffleRule = if (forceOptimizeSkewedJoin) {
-      // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
-      // ensure the output partitioning of OptimizeSkewedJoin is always expected.
-      Seq(EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined))
+  private def queryStagePreparationRules(
+      optimizeSkewedJoin: Boolean = false): Seq[Rule[SparkPlan]] = {
+    val optimizeSkewedJoinRules = if (optimizeSkewedJoin) {
+      Seq(OptimizeSkewedJoin,
+        // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
+        // ensure the output partitioning of OptimizeSkewedJoin is always expected.
+        EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined))
     } else {
       Nil
     }
-    Seq(OptimizeSkewedJoin) ++ ensureExtraShuffleRule
+
+    Seq(
+      RemoveRedundantProjects,
+      // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for
+      // the final plan, but we do need to respect the user-specified repartition. Here we ask
+      // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
+      // around this case.
+      EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined),
+      RemoveRedundantSorts,
+      DisableUnnecessaryBucketedScan
+    ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules
   }
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
@@ -200,7 +199,7 @@ case class AdaptiveSparkPlanExec(
 
   @transient val initialPlan = context.session.withActive {
     applyPhysicalRules(
-      inputPlan, queryStagePreparationRules, Some((planChangeLogger, "AQE Preparations")))
+      inputPlan, queryStagePreparationRules(), Some((planChangeLogger, "AQE Preparations")))
   }
 
   @volatile private var currentPhysicalPlan = initialPlan
@@ -332,7 +331,7 @@ case class AdaptiveSparkPlanExec(
               }
             }._1
 
-        if (!newPhysicalPlan.fastEquals(currentPhysicalPlan)) {
+        if (newPhysicalPlan.ne(currentPhysicalPlan)) {
           logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan")
           cleanUpTempTags(newPhysicalPlan)
           currentPhysicalPlan = newPhysicalPlan
@@ -667,6 +666,25 @@ case class AdaptiveSparkPlanExec(
     logicalPlan
   }
 
+  private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
+    sparkPlan match {
+      // avoid top level node is Exchange
+      case _: Exchange => false
+      case plan =>
+        // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage
+        // and all query stages are materialized.
+        plan.find {
+          case p if p.children.exists(
+            child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) =>
+            p match {
+              case stage: QueryStageExec if stage.isMaterialized => false
+              case _ => true
+            }
+          case _ => false
+        }.isEmpty
+    }
+  }
+
   /**
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */
@@ -675,11 +693,25 @@ case class AdaptiveSparkPlanExec(
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
+
     val optimizedPhysicalPlan = applyPhysicalRules(
       sparkPlan,
-      preprocessingRules ++ queryStagePreparationRules,
+      preprocessingRules ++ queryStagePreparationRules(),
       Some((planChangeLogger, "AQE Replanning")))
 
+    val optimizedWithSkewedJoin = applyPhysicalRules(
+      sparkPlan,
+      preprocessingRules ++ queryStagePreparationRules(true),
+      Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join")))
+
+    // respect the requiredDistribution for final stage
+    val validatedWithSkewedJoin =
+      checkDistribution(
+        optimizedWithSkewedJoin,
+        optimizedPhysicalPlan,
+        isFinalStage(optimizedWithSkewedJoin),
+        OptimizeSkewedJoin.ruleName)
+
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
     // add the `BroadcastExchangeExec` node manually in the DPP subquery,
     // not through `EnsureRequirements` rule. Therefore, when the DPP subquery is complicated
@@ -693,23 +725,10 @@ case class AdaptiveSparkPlanExec(
       case _ => plan
     }
 
-    val optimizedWithSkewedJoin = applyPhysicalRules(
-      optimizedPhysicalPlan,
-      optimizeSkewedJoinWithExtraShuffleRules,
-      Some((planChangeLogger, "AQE Optimize Skewed Join With Extra Shuffle"))
-    )
-    val validatedWithSkewedJoin =
-      checkDistribution(
-        optimizedWithSkewedJoin,
-        optimizedPhysicalPlan,
-        isFinalStage(optimizedWithSkewedJoin),
-        OptimizeSkewedJoin.ruleName)
-
-    // here are three reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan:
+    // here are two reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan:
     // 1. no skewed join optimized
-    // 2. optimize skewed join introduce extra shuffle and force optimize is disabled
-    // 3. optimize skewed join change final stage output partitioning
-    val newPhysicalPlans = if (validatedWithSkewedJoin.fastEquals(optimizedPhysicalPlan)) {
+    // 2. optimize skewed join doesn't satisfy requiredDistribution for final stage
+    val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(validatedWithSkewedJoin)) {
       updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
     } else {
       updateBroadcastExchange(optimizedPhysicalPlan) ::
@@ -718,25 +737,6 @@ case class AdaptiveSparkPlanExec(
     (newPhysicalPlans, optimized)
   }
 
-  private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
-    sparkPlan match {
-      // avoid top level node is Exchange
-      case _: Exchange => false
-      case plan =>
-        // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage
-        // and all query stages are materialized.
-        plan.find {
-          case p if p.children.exists(
-            child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) =>
-            p match {
-              case stage: QueryStageExec if stage.isMaterialized => false
-              case _ => true
-            }
-          case _ => false
-        }.isEmpty
-    }
-  }
-
   /**
    * Recursively set `TEMP_LOGICAL_PLAN_TAG` for the current `plan` node.
    */

From f5ad40e7ef8fafc4e084ef15d7956d0b3eaf214e Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 20 Aug 2021 12:36:07 +0800
Subject: [PATCH 18/37] nit

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 2e6585bd5b818..1d51950fdff0a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -80,9 +80,6 @@ case class AdaptiveSparkPlanExec(
     case _ => logDebug(_)
   }
 
-  @transient private val forceOptimizeSkewedJoin =
-    conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN)
-
   @transient private val planChangeLogger = new PlanChangeLogger[SparkPlan]()
 
   // The logical plan optimizer for re-optimizing the current logical plan.
@@ -194,7 +191,7 @@ case class AdaptiveSparkPlanExec(
   @transient private val costEvaluator =
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
-      case _ => SimpleCostEvaluator(forceOptimizeSkewedJoin)
+      case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
     }
 
   @transient val initialPlan = context.session.withActive {

From 8058fe9354ed28b32f0ea0bdc8c9243f79c58fba Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Fri, 20 Aug 2021 12:41:20 +0800
Subject: [PATCH 19/37] nit

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 1d51950fdff0a..72f88310de81f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -104,8 +104,8 @@ case class AdaptiveSparkPlanExec(
       optimizeSkewedJoin: Boolean = false): Seq[Rule[SparkPlan]] = {
     val optimizeSkewedJoinRules = if (optimizeSkewedJoin) {
       Seq(OptimizeSkewedJoin,
-        // Add the EnsureRequirements rule here and don't optimize out repartition so that we can
-        // ensure the output partitioning of OptimizeSkewedJoin is always expected.
+        // Add the EnsureRequirements rule here because OptimizeSkewedJoin changes the output
+        // partitioning, so that we still end up with the right distribution.
         EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined))
     } else {
       Nil

From 369bf33c27e940cced13e72feb54fc40f945f20e Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 25 Aug 2021 11:13:31 +0800
Subject: [PATCH 20/37] ensureRequiredDistribution

---
 .../sql/execution/adaptive/AQEUtils.scala     | 19 +++++-
 .../adaptive/AdaptiveSparkPlanExec.scala      | 49 ++++++---------
 .../adaptive/AdaptiveQueryExecSuite.scala     | 62 ++++++++++++++-----
 3 files changed, 81 insertions(+), 49 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
index 277af212d81f3..e2d19e8fed730 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.sql.execution.adaptive
 
+import org.apache.spark.sql.catalyst.SQLConfHelper
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution, HashPartitioning, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan}
 import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec}
 
-object AQEUtils {
+object AQEUtils extends SQLConfHelper {
 
   // Analyze the given plan and calculate the required distribution of this plan w.r.t. the
   // user-specified repartition.
@@ -57,4 +58,20 @@ object AQEUtils {
       }
     case _ => Some(UnspecifiedDistribution)
   }
+
+  // Add an extra shuffle if input plan does not satisfy the required distribution.
+  // This method is invoked after optimizing skewed join in case we change final stage
+  // output partitioning.
+  def ensureRequiredDistribution(
+      plan: SparkPlan, distribution: Option[Distribution]): SparkPlan = distribution match {
+    case Some(d) if !plan.outputPartitioning.satisfies(d) =>
+      val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions)
+      val shuffleOrigin = if (d.requiredNumPartitions.isDefined) {
+        REPARTITION_BY_NUM
+      } else {
+        REPARTITION_BY_COL
+      }
+      ShuffleExchangeExec(d.createPartitioning(numPartitions), plan, shuffleOrigin)
+    case _ => plan
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 72f88310de81f..1831e73fd085d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -154,8 +154,22 @@ case class AdaptiveSparkPlanExec(
     val optimized = queryStageOptimizerRules.foldLeft(plan) { case (latestPlan, rule) =>
       val applied = rule.apply(latestPlan)
       val result = rule match {
-        case _: AQEShuffleReadRule =>
-          checkDistribution(applied, latestPlan, isFinalStage, rule.ruleName)
+        case _: AQEShuffleReadRule if !applied.fastEquals(latestPlan) =>
+          val distribution = if (isFinalStage) {
+            // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize
+            // out the user-specified repartition, thus we don't have a distribution requirement
+            // for the final plan.
+            requiredDistribution.getOrElse(UnspecifiedDistribution)
+          } else {
+            UnspecifiedDistribution
+          }
+          if (ValidateRequirements.validate(applied, distribution)) {
+            applied
+          } else {
+            logDebug(s"Rule ${rule.ruleName} is not applied as it breaks the " +
+              "distribution requirement of the query plan.")
+            latestPlan
+          }
         case _ => applied
       }
       planChangeLogger.logRule(rule.ruleName, latestPlan, result)
@@ -165,29 +179,6 @@ case class AdaptiveSparkPlanExec(
     optimized
   }
 
-  private def checkDistribution(
-      newPlan: SparkPlan,
-      originPlan: SparkPlan,
-      isFinalStage: Boolean,
-      ruleName: String): SparkPlan = {
-    if (newPlan.fastEquals(originPlan)) return originPlan
-    val distribution = if (isFinalStage) {
-      // If `requiredDistribution` is None, it means `EnsureRequirements` will not optimize
-      // out the user-specified repartition, thus we don't have a distribution requirement
-      // for the final plan.
-      requiredDistribution.getOrElse(UnspecifiedDistribution)
-    } else {
-      UnspecifiedDistribution
-    }
-    if (ValidateRequirements.validate(newPlan, distribution)) {
-      newPlan
-    } else {
-      logDebug(s"Rule $ruleName is not applied as it breaks the " +
-        "distribution requirement of the query plan.")
-      originPlan
-    }
-  }
-
   @transient private val costEvaluator =
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
@@ -701,13 +692,9 @@ case class AdaptiveSparkPlanExec(
       preprocessingRules ++ queryStagePreparationRules(true),
       Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join")))
 
-    // respect the requiredDistribution for final stage
+    // Ensure the output partitioning satisfies requiredDistribution, adding a shuffle if needed
     val validatedWithSkewedJoin =
-      checkDistribution(
-        optimizedWithSkewedJoin,
-        optimizedPhysicalPlan,
-        isFinalStage(optimizedWithSkewedJoin),
-        OptimizeSkewedJoin.ruleName)
+      AQEUtils.ensureRequiredDistribution(optimizedWithSkewedJoin, requiredDistribution)
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
     // add the `BroadcastExchangeExec` node manually in the DPP subquery,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index 438529aad0e6f..b50d271e41428 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -96,6 +96,12 @@ class AdaptiveQueryExecSuite
     (dfAdaptive.queryExecution.sparkPlan, adaptivePlan)
   }
 
+  private def findTopLevelShuffle(plan: SparkPlan): Seq[ShuffleExchangeExec] = {
+    collect(plan) {
+      case s: ShuffleExchangeExec => s
+    }
+  }
+
   private def findTopLevelBroadcastHashJoin(plan: SparkPlan): Seq[BroadcastHashJoinExec] = {
     collect(plan) {
       case j: BroadcastHashJoinExec => j
@@ -1928,23 +1934,45 @@ class AdaptiveQueryExecSuite
           .selectExpr("id % 1 as key2", "id as value2")
           .createOrReplaceTempView("skewData2")
 
-        val (_, adaptive) = runAdaptiveAndVerifyResult(
-          "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1")
-        val smj = findTopLevelSortMergeJoin(adaptive)
-        assert(smj.size == 1 && smj.forall(_.isSkewJoin))
-        checkNumLocalShuffleReads(adaptive, 3)
-
-        val (_, adaptive2) = runAdaptiveAndVerifyResult(
-          "SELECT /*+ repartition */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2")
-        val smj2 = findTopLevelSortMergeJoin(adaptive2)
-        assert(smj2.size == 1 && smj2.forall(_.isSkewJoin))
-        // top level shuffle reader is local
-        checkNumLocalShuffleReads(adaptive2, 2)
-
-        val (_, adaptive3) = runAdaptiveAndVerifyResult(
-          "SELECT /*+ repartition(key1) */ key1 FROM skewData1 JOIN skewData2 ON key1 = key2")
-        val smj3 = findTopLevelSortMergeJoin(adaptive3)
-        assert(smj3.size == 1 && !smj3.exists(_.isSkewJoin))
+        // check if optimized skewed join does not satisfy the required distribution
+        Seq(true, false).foreach { hasRequiredDistribution =>
+          Seq(true, false).foreach { hasPartitionNumber =>
+            val repartition = if (hasRequiredDistribution) {
+              s"/*+ repartition(${ if (hasPartitionNumber) "10," else ""}key1) */"
+            } else {
+              ""
+            }
+
+            // check required distribution and extra shuffle
+            val (_, adaptive1) =
+              runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " +
+                s"JOIN skewData2 ON key1 = key2 GROUP BY key1")
+            val shuffles1 = findTopLevelShuffle(adaptive1)
+            assert(shuffles1.size == 3)
+            assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS)
+            val smj1 = findTopLevelSortMergeJoin(adaptive1)
+            assert(smj1.size == 1 && smj1.exists(_.isSkewJoin))
+
+            // only check required distribution
+            val (_, adaptive2) =
+              runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " +
+                s"JOIN skewData2 ON key1 = key2")
+            val shuffles2 = findTopLevelShuffle(adaptive2)
+            if (hasRequiredDistribution) {
+              assert(shuffles2.size == 3)
+              val finalShuffle = shuffles2.head
+              if (hasPartitionNumber) {
+                assert(finalShuffle.shuffleOrigin == REPARTITION_BY_NUM)
+              } else {
+                assert(finalShuffle.shuffleOrigin == REPARTITION_BY_COL)
+              }
+            } else {
+              assert(shuffles2.size == 2)
+            }
+            val smj2 = findTopLevelSortMergeJoin(adaptive2)
+            assert(smj2.size == 1 && smj2.exists(_.isSkewJoin))
+          }
+        }
       }
     }
   }

From d93c3df5605f358e76ba3cdd314a05067dd39f92 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 25 Aug 2021 11:16:26 +0800
Subject: [PATCH 21/37] remove dead code

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 1831e73fd085d..c34f1d21dab08 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -654,25 +654,6 @@ case class AdaptiveSparkPlanExec(
     logicalPlan
   }
 
-  private def isFinalStage(sparkPlan: SparkPlan): Boolean = {
-    sparkPlan match {
-      // avoid top level node is Exchange
-      case _: Exchange => false
-      case plan =>
-        // Plan is regarded as a final plan iff all shuffle nodes are wrapped inside query stage
-        // and all query stages are materialized.
-        plan.find {
-          case p if p.children.exists(
-            child => child.isInstanceOf[Exchange] || child.isInstanceOf[ReusedExchangeExec]) =>
-            p match {
-              case stage: QueryStageExec if stage.isMaterialized => false
-              case _ => true
-            }
-          case _ => false
-        }.isEmpty
-    }
-  }
-
   /**
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */

From b215e2d0fdf959bd1c735297a1a33c1a078f2361 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 25 Aug 2021 11:20:26 +0800
Subject: [PATCH 22/37] simplify code

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index c34f1d21dab08..fc8c3f77fc813 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -668,14 +668,13 @@ case class AdaptiveSparkPlanExec(
       preprocessingRules ++ queryStagePreparationRules(),
       Some((planChangeLogger, "AQE Replanning")))
 
-    val optimizedWithSkewedJoin = applyPhysicalRules(
-      sparkPlan,
-      preprocessingRules ++ queryStagePreparationRules(true),
-      Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join")))
-
-    // ensure the output partitioning for requiredDistribution
-    val validatedWithSkewedJoin =
-      AQEUtils.ensureRequiredDistribution(optimizedWithSkewedJoin, requiredDistribution)
+    val optimizedWithSkewedJoin =
+      AQEUtils.ensureRequiredDistribution(
+        applyPhysicalRules(
+          sparkPlan,
+          preprocessingRules ++ queryStagePreparationRules(true),
+          Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))),
+        requiredDistribution)
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
     // add the `BroadcastExchangeExec` node manually in the DPP subquery,
@@ -690,14 +689,11 @@ case class AdaptiveSparkPlanExec(
       case _ => plan
     }
 
-    // here are two reasons if validatedWithSkewedJoin is equal to optimizedPhysicalPlan:
-    // 1. no skewed join optimized
-    // 2. optimize skewed join doesn't satisfy requiredDistribution for final stage
-    val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(validatedWithSkewedJoin)) {
+    val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(optimizedWithSkewedJoin)) {
       updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
     } else {
       updateBroadcastExchange(optimizedPhysicalPlan) ::
-        updateBroadcastExchange(validatedWithSkewedJoin) :: Nil
+        updateBroadcastExchange(optimizedWithSkewedJoin) :: Nil
     }
     (newPhysicalPlans, optimized)
   }

From 5b63e4db83363d7908b3403a9e40a7d1b2726aa9 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 25 Aug 2021 13:39:18 +0800
Subject: [PATCH 23/37] address comment: pick best new plan, then compare to current

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index fc8c3f77fc813..145903ba6b099 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -308,21 +308,23 @@ case class AdaptiveSparkPlanExec(
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
         val (newPhysicalPlans, newLogicalPlan) = reOptimize(logicalPlan)
-        val newPhysicalPlan =
-          (Seq(currentPhysicalPlan) ++ newPhysicalPlans)
+        // We pick the first newPhysicalPlan if have the same cost otherwise pick smaller cost one
+        val (preferredNewPhysicalPlan, newCost) =
+          newPhysicalPlans
             .map(plan => (plan, costEvaluator.evaluateCost(plan)))
             .reduce { (last, current) =>
-              if (current._2 < last._2 || (current._2 == last._2  && current._1 != last._1)) {
+              if (current._2 < last._2) {
                 current
               } else {
                 last
               }
-            }._1
-
-        if (newPhysicalPlan.ne(currentPhysicalPlan)) {
-          logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$newPhysicalPlan")
-          cleanUpTempTags(newPhysicalPlan)
-          currentPhysicalPlan = newPhysicalPlan
+            }
+        val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
+        if (newCost < origCost ||
+          (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) {
+          logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$preferredNewPhysicalPlan")
+          cleanUpTempTags(preferredNewPhysicalPlan)
+          currentPhysicalPlan = preferredNewPhysicalPlan
           currentLogicalPlan = newLogicalPlan
           stagesToReplace = Seq.empty[QueryStageExec]
         }
@@ -673,7 +675,7 @@ case class AdaptiveSparkPlanExec(
         applyPhysicalRules(
           sparkPlan,
           preprocessingRules ++ queryStagePreparationRules(true),
-          Some((planChangeLogger, "AQE Replanning With Optimize Skewed Join"))),
+          Some((planChangeLogger, "AQE Replanning"))),
         requiredDistribution)
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will

From 3ccc29b695b4343974df2cd2f7330863bfb1b7fc Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 25 Aug 2021 13:50:24 +0800
Subject: [PATCH 24/37] style: fix continuation-line indentation

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 145903ba6b099..4778ef6128d2d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -321,7 +321,7 @@ case class AdaptiveSparkPlanExec(
             }
         val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
         if (newCost < origCost ||
-          (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) {
+            (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) {
           logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$preferredNewPhysicalPlan")
           cleanUpTempTags(preferredNewPhysicalPlan)
           currentPhysicalPlan = preferredNewPhysicalPlan

From bc45d7040d5cd2a8017594605604dadceeea8fb2 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Wed, 25 Aug 2021 16:23:43 +0800
Subject: [PATCH 25/37] fix order: prefer skew-join plan on cost ties

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 4778ef6128d2d..148bb8f28032e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -694,8 +694,8 @@ case class AdaptiveSparkPlanExec(
     val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(optimizedWithSkewedJoin)) {
       updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
     } else {
-      updateBroadcastExchange(optimizedPhysicalPlan) ::
-        updateBroadcastExchange(optimizedWithSkewedJoin) :: Nil
+      updateBroadcastExchange(optimizedWithSkewedJoin) ::
+        updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
     }
     (newPhysicalPlans, optimized)
   }

From 580a0a4548e6c25c8191dbf9de7256045bc86e0c Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 26 Aug 2021 16:11:03 +0800
Subject: [PATCH 26/37] address comment: add prepareQueryStages, pass conf explicitly

---
 .../sql/execution/adaptive/AQEUtils.scala     |  8 +++--
 .../adaptive/AdaptiveSparkPlanExec.scala      | 34 ++++++++++++-------
 .../adaptive/OptimizeSkewedJoin.scala         |  9 +++--
 .../execution/adaptive/simpleCosting.scala    |  8 ++---
 .../adaptive/AdaptiveQueryExecSuite.scala     |  5 +--
 5 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
index e2d19e8fed730..40b46f0100dc7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark.sql.execution.adaptive
 
-import org.apache.spark.sql.catalyst.SQLConfHelper
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution, HashPartitioning, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan}
 import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec}
+import org.apache.spark.sql.internal.SQLConf
 
-object AQEUtils extends SQLConfHelper {
+object AQEUtils {
 
   // Analyze the given plan and calculate the required distribution of this plan w.r.t. the
   // user-specified repartition.
@@ -63,7 +63,9 @@ object AQEUtils extends SQLConfHelper {
   // This method is invoked after optimizing skewed join in case we change final stage
   // output partitioning.
   def ensureRequiredDistribution(
-      plan: SparkPlan, distribution: Option[Distribution]): SparkPlan = distribution match {
+      plan: SparkPlan,
+      distribution: Option[Distribution],
+      conf: SQLConf): SparkPlan = distribution match {
     case Some(d) if !plan.outputPartitioning.satisfies(d) =>
       val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions)
       val shuffleOrigin = if (d.requiredNumPartitions.isDefined) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 148bb8f28032e..9f5581e547cee 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -179,6 +179,25 @@ case class AdaptiveSparkPlanExec(
     optimized
   }
 
+  def prepareQueryStages(
+      plan: SparkPlan,
+      optimizeSkewedJoin: Boolean): SparkPlan = {
+    if (optimizeSkewedJoin) {
+      AQEUtils.ensureRequiredDistribution(
+        applyPhysicalRules(
+          plan,
+          preprocessingRules ++ queryStagePreparationRules(true),
+          Some((planChangeLogger, "AQE Replanning"))),
+        requiredDistribution,
+        conf)
+    } else {
+      applyPhysicalRules(
+        plan,
+        preprocessingRules ++ queryStagePreparationRules(),
+        Some((planChangeLogger, "AQE Replanning")))
+    }
+  }
+
   @transient private val costEvaluator =
     conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
       case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
@@ -664,19 +683,8 @@ case class AdaptiveSparkPlanExec(
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
-
-    val optimizedPhysicalPlan = applyPhysicalRules(
-      sparkPlan,
-      preprocessingRules ++ queryStagePreparationRules(),
-      Some((planChangeLogger, "AQE Replanning")))
-
-    val optimizedWithSkewedJoin =
-      AQEUtils.ensureRequiredDistribution(
-        applyPhysicalRules(
-          sparkPlan,
-          preprocessingRules ++ queryStagePreparationRules(true),
-          Some((planChangeLogger, "AQE Replanning"))),
-        requiredDistribution)
+    val optimizedPhysicalPlan = prepareQueryStages(sparkPlan, false)
+    val optimizedWithSkewedJoin = prepareQueryStages(sparkPlan, true)
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
     // add the `BroadcastExchangeExec` node manually in the DPP subquery,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index dcdc67360f98c..0b4a1c2b5a557 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -22,8 +22,9 @@ import scala.collection.mutable
 import org.apache.commons.io.FileUtils
 
 import org.apache.spark.sql.catalyst.plans._
+import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleOrigin}
+import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS
 import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.internal.SQLConf
 
@@ -48,9 +49,7 @@ import org.apache.spark.sql.internal.SQLConf
  * (L3, R3-1), (L3, R3-2),
  * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2)
  */
-object OptimizeSkewedJoin extends AQEShuffleReadRule {
-
-  override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS)
+object OptimizeSkewedJoin extends Rule[SparkPlan] {
 
   /**
    * A partition is considered as a skewed partition if its size is larger than the median
@@ -259,7 +258,7 @@ object OptimizeSkewedJoin extends AQEShuffleReadRule {
   object ShuffleStage {
     def unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match {
       case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined &&
-        isSupported(s.shuffle) =>
+        s.shuffle.shuffleOrigin == ENSURE_REQUIREMENTS =>
         Some(s)
       case _ => None
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
index 621fb01832ccd..eaae94d9d5628 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
@@ -67,17 +67,17 @@ case class SkewJoinAwareCost(
  */
 case class SimpleCostEvaluator(forceOptimizeSkewedJoin: Boolean) extends CostEvaluator {
   override def evaluateCost(plan: SparkPlan): Cost = {
-    val shuffleNumber = plan.collect {
+    val numShuffles = plan.collect {
       case s: ShuffleExchangeLike => s
     }.size
 
     if (forceOptimizeSkewedJoin) {
-      val skewJoinNumber = plan.collect {
+      val numSkewJoins = plan.collect {
         case j: ShuffledJoin if j.isSkewJoin => j
       }.size
-      SkewJoinAwareCost(shuffleNumber, skewJoinNumber)
+      SkewJoinAwareCost(numShuffles, numSkewJoins)
     } else {
-      SimpleCost(shuffleNumber)
+      SimpleCost(numShuffles)
     }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index b50d271e41428..4368e26e71fcc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1949,9 +1949,10 @@ class AdaptiveQueryExecSuite
                 s"JOIN skewData2 ON key1 = key2 GROUP BY key1")
             val shuffles1 = findTopLevelShuffle(adaptive1)
             assert(shuffles1.size == 3)
+            // the head shuffle is from second EnsureRequirements in queryStagePreparationRules
             assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS)
             val smj1 = findTopLevelSortMergeJoin(adaptive1)
-            assert(smj1.size == 1 && smj1.exists(_.isSkewJoin))
+            assert(smj1.size == 1 && smj1.head.isSkewJoin)
 
             // only check required distribution
             val (_, adaptive2) =
@@ -1970,7 +1971,7 @@ class AdaptiveQueryExecSuite
               assert(shuffles2.size == 2)
             }
             val smj2 = findTopLevelSortMergeJoin(adaptive2)
-            assert(smj2.size == 1 && smj2.exists(_.isSkewJoin))
+            assert(smj2.size == 1 && smj2.head.isSkewJoin)
           }
         }
       }

From bc39694258c88e338d41df7db51aa6b123f0a32a Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 26 Aug 2021 16:33:09 +0800
Subject: [PATCH 27/37] address comment: always ensure required distribution

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 22 ++++++-------------
 .../adaptive/AdaptiveQueryExecSuite.scala     | 16 ++++++--------
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 9f5581e547cee..8c57af999f2f8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -100,8 +100,7 @@ case class AdaptiveSparkPlanExec(
   // A list of physical plan rules to be applied before creation of query stages. The physical
   // plan should reach a final status of query stages (i.e., no more addition or removal of
   // Exchange nodes) after running these rules.
-  private def queryStagePreparationRules(
-      optimizeSkewedJoin: Boolean = false): Seq[Rule[SparkPlan]] = {
+  private def queryStagePreparationRules(optimizeSkewedJoin: Boolean): Seq[Rule[SparkPlan]] = {
     val optimizeSkewedJoinRules = if (optimizeSkewedJoin) {
       Seq(OptimizeSkewedJoin,
         // Add the EnsureRequirements rule here since OptimizeSkewedJoin will change
@@ -182,20 +181,13 @@ case class AdaptiveSparkPlanExec(
   def prepareQueryStages(
       plan: SparkPlan,
       optimizeSkewedJoin: Boolean): SparkPlan = {
-    if (optimizeSkewedJoin) {
-      AQEUtils.ensureRequiredDistribution(
-        applyPhysicalRules(
-          plan,
-          preprocessingRules ++ queryStagePreparationRules(true),
-          Some((planChangeLogger, "AQE Replanning"))),
-        requiredDistribution,
-        conf)
-    } else {
+    AQEUtils.ensureRequiredDistribution(
       applyPhysicalRules(
         plan,
-        preprocessingRules ++ queryStagePreparationRules(),
-        Some((planChangeLogger, "AQE Replanning")))
-    }
+        preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin),
+        Some((planChangeLogger, "AQE Replanning"))),
+      requiredDistribution,
+      conf)
   }
 
   @transient private val costEvaluator =
@@ -206,7 +198,7 @@ case class AdaptiveSparkPlanExec(
 
   @transient val initialPlan = context.session.withActive {
     applyPhysicalRules(
-      inputPlan, queryStagePreparationRules(), Some((planChangeLogger, "AQE Preparations")))
+      inputPlan, queryStagePreparationRules(false), Some((planChangeLogger, "AQE Preparations")))
   }
 
   @volatile private var currentPhysicalPlan = initialPlan
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
index 4368e26e71fcc..548ba8706084a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -96,12 +96,6 @@ class AdaptiveQueryExecSuite
     (dfAdaptive.queryExecution.sparkPlan, adaptivePlan)
   }
 
-  private def findTopLevelShuffle(plan: SparkPlan): Seq[ShuffleExchangeExec] = {
-    collect(plan) {
-      case s: ShuffleExchangeExec => s
-    }
-  }
-
   private def findTopLevelBroadcastHashJoin(plan: SparkPlan): Seq[BroadcastHashJoinExec] = {
     collect(plan) {
       case j: BroadcastHashJoinExec => j
@@ -1947,9 +1941,11 @@ class AdaptiveQueryExecSuite
             val (_, adaptive1) =
               runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " +
                 s"JOIN skewData2 ON key1 = key2 GROUP BY key1")
-            val shuffles1 = findTopLevelShuffle(adaptive1)
+            val shuffles1 = collect(adaptive1) {
+              case s: ShuffleExchangeExec => s
+            }
             assert(shuffles1.size == 3)
-            // the head shuffle is from second EnsureRequirements in queryStagePreparationRules
+            // shuffles1.head is the top-level shuffle under the Aggregate operator
             assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS)
             val smj1 = findTopLevelSortMergeJoin(adaptive1)
             assert(smj1.size == 1 && smj1.head.isSkewJoin)
@@ -1958,7 +1954,9 @@ class AdaptiveQueryExecSuite
             val (_, adaptive2) =
               runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM skewData1 " +
                 s"JOIN skewData2 ON key1 = key2")
-            val shuffles2 = findTopLevelShuffle(adaptive2)
+            val shuffles2 = collect(adaptive2) {
+              case s: ShuffleExchangeExec => s
+            }
             if (hasRequiredDistribution) {
               assert(shuffles2.size == 3)
               val finalShuffle = shuffles2.head

From bb2e71375717ae9ed28ac0cb255e495dc1e334d8 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 2 Sep 2021 13:23:06 +0800
Subject: [PATCH 28/37] address comment: move distribution check into EnsureRequirements

---
 .../sql/execution/adaptive/AQEUtils.scala     | 19 -------
 .../adaptive/AdaptiveSparkPlanExec.scala      | 15 +++---
 .../exchange/EnsureRequirements.scala         | 54 +++++++++++++------
 3 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
index 40b46f0100dc7..277af212d81f3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEUtils.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.adaptive
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution, HashPartitioning, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan}
 import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec}
-import org.apache.spark.sql.internal.SQLConf
 
 object AQEUtils {
 
@@ -58,22 +57,4 @@ object AQEUtils {
       }
     case _ => Some(UnspecifiedDistribution)
   }
-
-  // Add an extra shuffle if input plan does not satisfy the required distribution.
-  // This method is invoked after optimizing skewed join in case we change final stage
-  // output partitioning.
-  def ensureRequiredDistribution(
-      plan: SparkPlan,
-      distribution: Option[Distribution],
-      conf: SQLConf): SparkPlan = distribution match {
-    case Some(d) if !plan.outputPartitioning.satisfies(d) =>
-      val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions)
-      val shuffleOrigin = if (d.requiredNumPartitions.isDefined) {
-        REPARTITION_BY_NUM
-      } else {
-        REPARTITION_BY_COL
-      }
-      ShuffleExchangeExec(d.createPartitioning(numPartitions), plan, shuffleOrigin)
-    case _ => plan
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 8c57af999f2f8..64a08b322b7de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -105,7 +105,7 @@ case class AdaptiveSparkPlanExec(
       Seq(OptimizeSkewedJoin,
         // Add the EnsureRequirements rule here since OptimizeSkewedJoin will change
         // output partitioning, make sure we have right distribution.
-        EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined))
+        EnsureRequirements(requiredDistribution.isDefined, requiredDistribution))
     } else {
       Nil
     }
@@ -116,7 +116,7 @@ case class AdaptiveSparkPlanExec(
       // the final plan, but we do need to respect the user-specified repartition. Here we ask
       // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
       // around this case.
-      EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined),
+      EnsureRequirements(requiredDistribution.isDefined, requiredDistribution),
       RemoveRedundantSorts,
       DisableUnnecessaryBucketedScan
     ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules
@@ -181,13 +181,10 @@ case class AdaptiveSparkPlanExec(
   def prepareQueryStages(
       plan: SparkPlan,
       optimizeSkewedJoin: Boolean): SparkPlan = {
-    AQEUtils.ensureRequiredDistribution(
-      applyPhysicalRules(
-        plan,
-        preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin),
-        Some((planChangeLogger, "AQE Replanning"))),
-      requiredDistribution,
-      conf)
+    applyPhysicalRules(
+      plan,
+      preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin),
+      Some((planChangeLogger, "AQE Replanning")))
   }
 
   @transient private val costEvaluator =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
index 23716f1081d34..35ca0b3c3ef92 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -38,8 +38,13 @@ import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoin
  *                               but can be false in AQE when AQE optimization may change the plan
  *                               output partitioning and need to retain the user-specified
  *                               repartition shuffles in the plan.
+ * @param requiredDistribution The root required distribution we should ensure. This value is used
+ *                             in AQE in case we change final stage output partitioning.
  */
-case class EnsureRequirements(optimizeOutRepartition: Boolean = true) extends Rule[SparkPlan] {
+case class EnsureRequirements(
+    optimizeOutRepartition: Boolean = true,
+    requiredDistribution: Option[Distribution] = None)
+  extends Rule[SparkPlan] {
 
   private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = {
     val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution
@@ -254,25 +259,40 @@ case class EnsureRequirements(optimizeOutRepartition: Boolean = true) extends Ru
     }
   }
 
-  def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
-    case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin)
+  def apply(plan: SparkPlan): SparkPlan = {
+    val newPlan = plan.transformUp {
+      case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin)
         if optimizeOutRepartition &&
           (shuffleOrigin == REPARTITION_BY_COL || shuffleOrigin == REPARTITION_BY_NUM) =>
-      def hasSemanticEqualPartitioning(partitioning: Partitioning): Boolean = {
-        partitioning match {
-          case lower: HashPartitioning if upper.semanticEquals(lower) => true
-          case lower: PartitioningCollection =>
-            lower.partitionings.exists(hasSemanticEqualPartitioning)
-          case _ => false
+        def hasSemanticEqualPartitioning(partitioning: Partitioning): Boolean = {
+          partitioning match {
+            case lower: HashPartitioning if upper.semanticEquals(lower) => true
+            case lower: PartitioningCollection =>
+              lower.partitionings.exists(hasSemanticEqualPartitioning)
+            case _ => false
+          }
+        }
+        if (hasSemanticEqualPartitioning(child.outputPartitioning)) {
+          child
+        } else {
+          operator
         }
-      }
-      if (hasSemanticEqualPartitioning(child.outputPartitioning)) {
-        child
-      } else {
-        operator
-      }
 
-    case operator: SparkPlan =>
-      ensureDistributionAndOrdering(reorderJoinPredicates(operator))
+      case operator: SparkPlan =>
+        ensureDistributionAndOrdering(reorderJoinPredicates(operator))
+    }
+
+    requiredDistribution match {
+      case Some(d) if !newPlan.outputPartitioning.satisfies(d) =>
+        val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions)
+        val shuffleOrigin = if (d.requiredNumPartitions.isDefined) {
+          REPARTITION_BY_NUM
+        } else {
+          REPARTITION_BY_COL
+        }
+        ShuffleExchangeExec(d.createPartitioning(numPartitions), newPlan, shuffleOrigin)
+
+      case _ => newPlan
+    }
   }
 }

From d3f013114554f951bd49ade34ce8f8b9b64a1ece Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 2 Sep 2021 13:26:05 +0800
Subject: [PATCH 29/37] nit

---
 .../spark/sql/execution/exchange/EnsureRequirements.scala     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
index 35ca0b3c3ef92..363f9751fc963 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -262,8 +262,8 @@ case class EnsureRequirements(
   def apply(plan: SparkPlan): SparkPlan = {
     val newPlan = plan.transformUp {
       case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin)
-        if optimizeOutRepartition &&
-          (shuffleOrigin == REPARTITION_BY_COL || shuffleOrigin == REPARTITION_BY_NUM) =>
+          if optimizeOutRepartition &&
+            (shuffleOrigin == REPARTITION_BY_COL || shuffleOrigin == REPARTITION_BY_NUM) =>
         def hasSemanticEqualPartitioning(partitioning: Partitioning): Boolean = {
           partitioning match {
             case lower: HashPartitioning if upper.semanticEquals(lower) => true

From 4712986d217e515197a4f6bee1ae9790fda62087 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 2 Sep 2021 13:46:49 +0800
Subject: [PATCH 30/37] nit

---
 .../spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 64a08b322b7de..654a85bde657f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -116,7 +116,7 @@ case class AdaptiveSparkPlanExec(
       // the final plan, but we do need to respect the user-specified repartition. Here we ask
       // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
       // around this case.
-      EnsureRequirements(requiredDistribution.isDefined, requiredDistribution),
+      EnsureRequirements(requiredDistribution.isDefined),
       RemoveRedundantSorts,
       DisableUnnecessaryBucketedScan
     ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules

From 23ebea0a408469f5c2a70e04982ffa6261dae96c Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Sun, 5 Sep 2021 21:32:44 +0800
Subject: [PATCH 31/37] address comment

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 99 +++++++------------
 .../adaptive/OptimizeSkewedJoin.scala         | 22 ++++-
 .../exchange/EnsureRequirements.scala         | 60 ++++++-----
 3 files changed, 89 insertions(+), 92 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 654a85bde657f..eb099a0409dec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -97,30 +97,26 @@ case class AdaptiveSparkPlanExec(
     AQEUtils.getRequiredDistribution(inputPlan)
   }
 
+  @transient private val costEvaluator =
+    conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
+      case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
+      case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
+    }
+
   // A list of physical plan rules to be applied before creation of query stages. The physical
   // plan should reach a final status of query stages (i.e., no more addition or removal of
   // Exchange nodes) after running these rules.
-  private def queryStagePreparationRules(optimizeSkewedJoin: Boolean): Seq[Rule[SparkPlan]] = {
-    val optimizeSkewedJoinRules = if (optimizeSkewedJoin) {
-      Seq(OptimizeSkewedJoin,
-        // Add the EnsureRequirements rule here since OptimizeSkewedJoin will change
-        // output partitioning, make sure we have right distribution.
-        EnsureRequirements(requiredDistribution.isDefined, requiredDistribution))
-    } else {
-      Nil
-    }
-
-    Seq(
-      RemoveRedundantProjects,
-      // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for
-      // the final plan, but we do need to respect the user-specified repartition. Here we ask
-      // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
-      // around this case.
-      EnsureRequirements(requiredDistribution.isDefined),
-      RemoveRedundantSorts,
-      DisableUnnecessaryBucketedScan
-    ) ++ optimizeSkewedJoinRules ++ context.session.sessionState.queryStagePrepRules
-  }
+  @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq(
+    RemoveRedundantProjects,
+    // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for
+    // the final plan, but we do need to respect the user-specified repartition. Here we ask
+    // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
+    // around this case.
+    EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined),
+    RemoveRedundantSorts,
+    DisableUnnecessaryBucketedScan,
+    OptimizeSkewedJoin(requiredDistribution, costEvaluator)
+  ) ++ context.session.sessionState.queryStagePrepRules
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
   // optimizations should be stage-independent.
@@ -178,24 +174,9 @@ case class AdaptiveSparkPlanExec(
     optimized
   }
 
-  def prepareQueryStages(
-      plan: SparkPlan,
-      optimizeSkewedJoin: Boolean): SparkPlan = {
-    applyPhysicalRules(
-      plan,
-      preprocessingRules ++ queryStagePreparationRules(optimizeSkewedJoin),
-      Some((planChangeLogger, "AQE Replanning")))
-  }
-
-  @transient private val costEvaluator =
-    conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
-      case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
-      case _ => SimpleCostEvaluator(conf.getConf(SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN))
-    }
-
   @transient val initialPlan = context.session.withActive {
     applyPhysicalRules(
-      inputPlan, queryStagePreparationRules(false), Some((planChangeLogger, "AQE Preparations")))
+      inputPlan, queryStagePreparationRules, Some((planChangeLogger, "AQE Preparations")))
   }
 
   @volatile private var currentPhysicalPlan = initialPlan
@@ -315,24 +296,14 @@ case class AdaptiveSparkPlanExec(
         // plans are updated, we can clear the query stage list because at this point the two plans
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
-        val (newPhysicalPlans, newLogicalPlan) = reOptimize(logicalPlan)
-        // We pick the first newPhysicalPlan if have the same cost otherwise pick smaller cost one
-        val (preferredNewPhysicalPlan, newCost) =
-          newPhysicalPlans
-            .map(plan => (plan, costEvaluator.evaluateCost(plan)))
-            .reduce { (last, current) =>
-              if (current._2 < last._2) {
-                current
-              } else {
-                last
-              }
-            }
+        val (newPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
         val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
+        val newCost = costEvaluator.evaluateCost(newPhysicalPlan)
         if (newCost < origCost ||
-            (newCost == origCost && currentPhysicalPlan != preferredNewPhysicalPlan)) {
-          logOnLevel(s"Plan changed from\n$currentPhysicalPlan\nto\n$preferredNewPhysicalPlan")
-          cleanUpTempTags(preferredNewPhysicalPlan)
-          currentPhysicalPlan = preferredNewPhysicalPlan
+            (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) {
+          logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan")
+          cleanUpTempTags(newPhysicalPlan)
+          currentPhysicalPlan = newPhysicalPlan
           currentLogicalPlan = newLogicalPlan
           stagesToReplace = Seq.empty[QueryStageExec]
         }
@@ -668,12 +639,14 @@ case class AdaptiveSparkPlanExec(
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */
   private def reOptimize(
-      logicalPlan: LogicalPlan): (Seq[SparkPlan], LogicalPlan) = context.qe.withCteMap {
+      logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = context.qe.withCteMap {
     logicalPlan.invalidateStatsCache()
     val optimized = optimizer.execute(logicalPlan)
     val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
-    val optimizedPhysicalPlan = prepareQueryStages(sparkPlan, false)
-    val optimizedWithSkewedJoin = prepareQueryStages(sparkPlan, true)
+    val newPlan = applyPhysicalRules(
+      sparkPlan,
+      preprocessingRules ++ queryStagePreparationRules,
+      Some((planChangeLogger, "AQE Replanning")))
 
     // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
     // add the `BroadcastExchangeExec` node manually in the DPP subquery,
@@ -682,19 +655,13 @@ case class AdaptiveSparkPlanExec(
     // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery.
     // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan
     // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule.
-    def updateBroadcastExchange(plan: SparkPlan): SparkPlan = currentPhysicalPlan match {
+    val finalPlan = currentPhysicalPlan match {
       case b: BroadcastExchangeLike
-        if (!plan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(plan))
-      case _ => plan
+        if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan))
+      case _ => newPlan
     }
 
-    val newPhysicalPlans = if (optimizedPhysicalPlan.fastEquals(optimizedWithSkewedJoin)) {
-      updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
-    } else {
-      updateBroadcastExchange(optimizedWithSkewedJoin) ::
-        updateBroadcastExchange(optimizedPhysicalPlan) :: Nil
-    }
-    (newPhysicalPlans, optimized)
+    (finalPlan, optimized)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index 0b4a1c2b5a557..72cc3c069f6bd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -22,9 +22,10 @@ import scala.collection.mutable
 import org.apache.commons.io.FileUtils
 
 import org.apache.spark.sql.catalyst.plans._
+import org.apache.spark.sql.catalyst.plans.physical.Distribution
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements}
 import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.internal.SQLConf
 
@@ -49,7 +50,10 @@ import org.apache.spark.sql.internal.SQLConf
  * (L3, R3-1), (L3, R3-2),
  * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2)
  */
-object OptimizeSkewedJoin extends Rule[SparkPlan] {
+case class OptimizeSkewedJoin(
+    requiredDistribution: Option[Distribution],
+    costEvaluator: CostEvaluator)
+  extends Rule[SparkPlan] {
 
   /**
    * A partition is considered as a skewed partition if its size is larger than the median
@@ -249,7 +253,19 @@ object OptimizeSkewedJoin extends Rule[SparkPlan] {
       // SHJ
       //   Shuffle
       //   Shuffle
-      optimizeSkewJoin(plan)
+      val optimized =
+        EnsureRequirements(requiredDistribution.isDefined, requiredDistribution)
+          .apply(optimizeSkewJoin(plan))
+      val originCost = costEvaluator.evaluateCost(plan)
+      val optimizedCost = costEvaluator.evaluateCost(optimized)
+      // two cases we will pick new plan:
+      //   1. optimize the skew join without extra shuffle
+      //   2. optimize the skew join with extra shuffle but the costEvaluator think it's better
+      if (optimizedCost < originCost || (originCost == optimizedCost && optimized != plan)) {
+        optimized
+      } else {
+        plan
+      }
     } else {
       plan
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
index 363f9751fc963..c2a2f8d2df023 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -46,15 +46,15 @@ case class EnsureRequirements(
     requiredDistribution: Option[Distribution] = None)
   extends Rule[SparkPlan] {
 
-  private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = {
-    val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution
-    val requiredChildOrderings: Seq[Seq[SortOrder]] = operator.requiredChildOrdering
-    var children: Seq[SparkPlan] = operator.children
-    assert(requiredChildDistributions.length == children.length)
-    assert(requiredChildOrderings.length == children.length)
-
+  private def ensureDistributionAndOrdering(
+      originChildren: Seq[SparkPlan],
+      requiredChildDistributions: Seq[Distribution],
+      requiredChildOrderings: Seq[Seq[SortOrder]],
+      isRootDistribution: Boolean): Seq[SparkPlan] = {
+    assert(requiredChildDistributions.length == originChildren.length)
+    assert(requiredChildOrderings.length == originChildren.length)
     // Ensure that the operator's children satisfy their output distribution requirements.
-    children = children.zip(requiredChildDistributions).map {
+    var children = originChildren.zip(requiredChildDistributions).map {
       case (child, distribution) if child.outputPartitioning.satisfies(distribution) =>
         child
       case (child, BroadcastDistribution(mode)) =>
@@ -62,7 +62,16 @@ case class EnsureRequirements(
       case (child, distribution) =>
         val numPartitions = distribution.requiredNumPartitions
           .getOrElse(conf.numShufflePartitions)
-        ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child)
+        val shuffleOrigin = if (isRootDistribution) {
+          if (distribution.requiredNumPartitions.isDefined) {
+            REPARTITION_BY_NUM
+          } else {
+            REPARTITION_BY_COL
+          }
+        } else {
+          ENSURE_REQUIREMENTS
+        }
+        ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child, shuffleOrigin)
     }
 
     // Get the indexes of children which have specified distribution requirements and need to have
@@ -83,7 +92,7 @@ case class EnsureRequirements(
           index => requiredChildDistributions(index).requiredNumPartitions
         }.toSet
         assert(numPartitionsSet.size <= 1,
-          s"$operator have incompatible requirements of the number of partitions for its children")
+          s"$requiredChildDistributions have incompatible requirements of the number of partitions")
         numPartitionsSet.headOption
       }
 
@@ -138,7 +147,7 @@ case class EnsureRequirements(
       }
     }
 
-    operator.withNewChildren(children)
+    children
   }
 
   private def reorder(
@@ -279,20 +288,25 @@ case class EnsureRequirements(
         }
 
       case operator: SparkPlan =>
-        ensureDistributionAndOrdering(reorderJoinPredicates(operator))
+        val reordered = reorderJoinPredicates(operator)
+        val newChildren = ensureDistributionAndOrdering(
+          reordered.children,
+          reordered.requiredChildDistribution,
+          reordered.requiredChildOrdering,
+          false)
+        reordered.withNewChildren(newChildren)
     }
 
-    requiredDistribution match {
-      case Some(d) if !newPlan.outputPartitioning.satisfies(d) =>
-        val numPartitions = d.requiredNumPartitions.getOrElse(conf.numShufflePartitions)
-        val shuffleOrigin = if (d.requiredNumPartitions.isDefined) {
-          REPARTITION_BY_NUM
-        } else {
-          REPARTITION_BY_COL
-        }
-        ShuffleExchangeExec(d.createPartitioning(numPartitions), newPlan, shuffleOrigin)
-
-      case _ => newPlan
+    if (requiredDistribution.isDefined) {
+      val finalPlan = ensureDistributionAndOrdering(
+        newPlan :: Nil,
+        requiredDistribution.get :: Nil,
+        Seq(Nil),
+        true)
+      assert(finalPlan.size == 1)
+      finalPlan.head
+    } else {
+      newPlan
     }
   }
 }

From ef0765f820185d05902ee35bd3bcc7851c4a1457 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 7 Sep 2021 16:00:14 +0800
Subject: [PATCH 32/37] pass EnsureRequirements

---
 .../sql/execution/adaptive/AdaptiveSparkPlanExec.scala      | 4 +++-
 .../spark/sql/execution/adaptive/OptimizeSkewedJoin.scala   | 6 ++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index eb099a0409dec..5284c4979bb1e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -115,7 +115,9 @@ case class AdaptiveSparkPlanExec(
     EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined),
     RemoveRedundantSorts,
     DisableUnnecessaryBucketedScan,
-    OptimizeSkewedJoin(requiredDistribution, costEvaluator)
+    OptimizeSkewedJoin(
+      EnsureRequirements(requiredDistribution.isDefined, requiredDistribution),
+      costEvaluator)
   ) ++ context.session.sessionState.queryStagePrepRules
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index 72cc3c069f6bd..58e88a6931d1c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -51,7 +51,7 @@ import org.apache.spark.sql.internal.SQLConf
  * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2)
  */
 case class OptimizeSkewedJoin(
-    requiredDistribution: Option[Distribution],
+    ensureRequirements: EnsureRequirements,
     costEvaluator: CostEvaluator)
   extends Rule[SparkPlan] {
 
@@ -253,9 +253,7 @@ case class OptimizeSkewedJoin(
       // SHJ
       //   Shuffle
       //   Shuffle
-      val optimized =
-        EnsureRequirements(requiredDistribution.isDefined, requiredDistribution)
-          .apply(optimizeSkewJoin(plan))
+      val optimized = ensureRequirements.apply(optimizeSkewJoin(plan))
       val originCost = costEvaluator.evaluateCost(plan)
       val optimizedCost = costEvaluator.evaluateCost(optimized)
       // two cases we will pick new plan:

From 76c363daacfcf35a08378859c710f3bd916bc153 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 7 Sep 2021 16:03:12 +0800
Subject: [PATCH 33/37] simplify

---
 .../spark/sql/execution/adaptive/OptimizeSkewedJoin.scala       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index 58e88a6931d1c..641baad9877a5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -259,7 +259,7 @@ case class OptimizeSkewedJoin(
       // two cases we will pick new plan:
       //   1. optimize the skew join without extra shuffle
       //   2. optimize the skew join with extra shuffle but the costEvaluator think it's better
-      if (optimizedCost < originCost || (originCost == optimizedCost && optimized != plan)) {
+      if (optimizedCost <= originCost) {
         optimized
       } else {
         plan

From 89610842f656d87f531a23337d027f6ec1f1ba08 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 7 Sep 2021 16:42:53 +0800
Subject: [PATCH 34/37] nit

---
 .../apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
index 641baad9877a5..2fe5b18a75ec8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala
@@ -22,7 +22,6 @@ import scala.collection.mutable
 import org.apache.commons.io.FileUtils
 
 import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.plans.physical.Distribution
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements}

From 5ba73c4e2958e807b11f5de4f9be56ea4b854964 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Tue, 7 Sep 2021 18:01:51 +0800
Subject: [PATCH 35/37] EnsureRequirements

---
 .../adaptive/AdaptiveSparkPlanExec.scala      | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
index 5284c4979bb1e..13c9528323ae8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -106,19 +106,21 @@ case class AdaptiveSparkPlanExec(
   // A list of physical plan rules to be applied before creation of query stages. The physical
   // plan should reach a final status of query stages (i.e., no more addition or removal of
   // Exchange nodes) after running these rules.
-  @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq(
-    RemoveRedundantProjects,
+  @transient private val queryStagePreparationRules: Seq[Rule[SparkPlan]] = {
     // For cases like `df.repartition(a, b).select(c)`, there is no distribution requirement for
     // the final plan, but we do need to respect the user-specified repartition. Here we ask
     // `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
     // around this case.
-    EnsureRequirements(optimizeOutRepartition = requiredDistribution.isDefined),
-    RemoveRedundantSorts,
-    DisableUnnecessaryBucketedScan,
-    OptimizeSkewedJoin(
-      EnsureRequirements(requiredDistribution.isDefined, requiredDistribution),
-      costEvaluator)
-  ) ++ context.session.sessionState.queryStagePrepRules
+    val ensureRequirements =
+      EnsureRequirements(requiredDistribution.isDefined, requiredDistribution)
+    Seq(
+      RemoveRedundantProjects,
+      ensureRequirements,
+      RemoveRedundantSorts,
+      DisableUnnecessaryBucketedScan,
+      OptimizeSkewedJoin(ensureRequirements, costEvaluator)
+    ) ++ context.session.sessionState.queryStagePrepRules
+  }
 
   // A list of physical optimizer rules to be applied to a new stage before its execution. These
   // optimizations should be stage-independent.

From ca6332167f20ca85c4ae23100a475530f9118a8e Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Thu, 9 Sep 2021 10:38:48 +0800
Subject: [PATCH 36/37] pull out shuffle origin

---
 .../exchange/EnsureRequirements.scala         | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
index c2a2f8d2df023..687b4c4490e09 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -50,7 +50,7 @@ case class EnsureRequirements(
       originChildren: Seq[SparkPlan],
       requiredChildDistributions: Seq[Distribution],
       requiredChildOrderings: Seq[Seq[SortOrder]],
-      isRootDistribution: Boolean): Seq[SparkPlan] = {
+      shuffleOrigin: ShuffleOrigin): Seq[SparkPlan] = {
     assert(requiredChildDistributions.length == originChildren.length)
     assert(requiredChildOrderings.length == originChildren.length)
     // Ensure that the operator's children satisfy their output distribution requirements.
@@ -62,15 +62,6 @@ case class EnsureRequirements(
       case (child, distribution) =>
         val numPartitions = distribution.requiredNumPartitions
           .getOrElse(conf.numShufflePartitions)
-        val shuffleOrigin = if (isRootDistribution) {
-          if (distribution.requiredNumPartitions.isDefined) {
-            REPARTITION_BY_NUM
-          } else {
-            REPARTITION_BY_COL
-          }
-        } else {
-          ENSURE_REQUIREMENTS
-        }
         ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child, shuffleOrigin)
     }
 
@@ -293,16 +284,21 @@ case class EnsureRequirements(
           reordered.children,
           reordered.requiredChildDistribution,
           reordered.requiredChildOrdering,
-          false)
+          ENSURE_REQUIREMENTS)
         reordered.withNewChildren(newChildren)
     }
 
     if (requiredDistribution.isDefined) {
+      val shuffleOrigin = if (requiredDistribution.get.requiredNumPartitions.isDefined) {
+        REPARTITION_BY_NUM
+      } else {
+        REPARTITION_BY_COL
+      }
       val finalPlan = ensureDistributionAndOrdering(
         newPlan :: Nil,
         requiredDistribution.get :: Nil,
         Seq(Nil),
-        true)
+        shuffleOrigin)
       assert(finalPlan.size == 1)
       finalPlan.head
     } else {

From f5e4b911ea608cdf863215feb6e13996a57cb294 Mon Sep 17 00:00:00 2001
From: ulysses-you <ulyssesyou18@gmail.com>
Date: Sat, 11 Sep 2021 18:00:19 +0800
Subject: [PATCH 37/37] address comment

---
 .../sql/execution/adaptive/simpleCosting.scala |  6 +++++-
 .../exchange/EnsureRequirements.scala          | 18 +++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
index eaae94d9d5628..864563be38557 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
@@ -36,7 +36,11 @@ case class SimpleCost(value: Long) extends Cost {
 }
 
 /**
- * A skew join aware implementation of [[Cost]], which consider shuffle number and skew join number
+ * A skew-join-aware implementation of [[Cost]] that considers the shuffle and skew join counts.
+ *
+ * We always pick the cost with more skew joins, even if it introduces one or more extra shuffles.
+ * Otherwise, if two costs have the same number of skew joins (or none), we pick the one with the
+ * smaller number of shuffles.
  */
 case class SkewJoinAwareCost(
     numShuffles: Int,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
index 687b4c4490e09..86b2344629d26 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -47,14 +47,14 @@ case class EnsureRequirements(
   extends Rule[SparkPlan] {
 
   private def ensureDistributionAndOrdering(
-      originChildren: Seq[SparkPlan],
+      originalChildren: Seq[SparkPlan],
       requiredChildDistributions: Seq[Distribution],
       requiredChildOrderings: Seq[Seq[SortOrder]],
       shuffleOrigin: ShuffleOrigin): Seq[SparkPlan] = {
-    assert(requiredChildDistributions.length == originChildren.length)
-    assert(requiredChildOrderings.length == originChildren.length)
+    assert(requiredChildDistributions.length == originalChildren.length)
+    assert(requiredChildOrderings.length == originalChildren.length)
     // Ensure that the operator's children satisfy their output distribution requirements.
-    var children = originChildren.zip(requiredChildDistributions).map {
+    var newChildren = originalChildren.zip(requiredChildDistributions).map {
       case (child, distribution) if child.outputPartitioning.satisfies(distribution) =>
         child
       case (child, BroadcastDistribution(mode)) =>
@@ -74,7 +74,7 @@ case class EnsureRequirements(
     }.map(_._2)
 
     val childrenNumPartitions =
-      childrenIndexes.map(children(_).outputPartitioning.numPartitions).toSet
+      childrenIndexes.map(newChildren(_).outputPartitioning.numPartitions).toSet
 
     if (childrenNumPartitions.size > 1) {
       // Get the number of partitions which is explicitly required by the distributions.
@@ -92,7 +92,7 @@ case class EnsureRequirements(
       // 1. We should avoid shuffling these children.
       // 2. We should have a reasonable parallelism.
       val nonShuffleChildrenNumPartitions =
-        childrenIndexes.map(children).filterNot(_.isInstanceOf[ShuffleExchangeExec])
+        childrenIndexes.map(newChildren).filterNot(_.isInstanceOf[ShuffleExchangeExec])
           .map(_.outputPartitioning.numPartitions)
       val expectedChildrenNumPartitions = if (nonShuffleChildrenNumPartitions.nonEmpty) {
         if (nonShuffleChildrenNumPartitions.length == childrenIndexes.length) {
@@ -111,7 +111,7 @@ case class EnsureRequirements(
 
       val targetNumPartitions = requiredNumPartitions.getOrElse(expectedChildrenNumPartitions)
 
-      children = children.zip(requiredChildDistributions).zipWithIndex.map {
+      newChildren = newChildren.zip(requiredChildDistributions).zipWithIndex.map {
         case ((child, distribution), index) if childrenIndexes.contains(index) =>
           if (child.outputPartitioning.numPartitions == targetNumPartitions) {
             child
@@ -129,7 +129,7 @@ case class EnsureRequirements(
     }
 
     // Now that we've performed any necessary shuffles, add sorts to guarantee output orderings:
-    children = children.zip(requiredChildOrderings).map { case (child, requiredOrdering) =>
+    newChildren = newChildren.zip(requiredChildOrderings).map { case (child, requiredOrdering) =>
       // If child.outputOrdering already satisfies the requiredOrdering, we do not need to sort.
       if (SortOrder.orderingSatisfies(child.outputOrdering, requiredOrdering)) {
         child
@@ -138,7 +138,7 @@ case class EnsureRequirements(
       }
     }
 
-    children
+    newChildren
   }
 
   private def reorder(