
Commit 5f69d60

[SPARK-24497][SQL] revert TreeNode changes, introduce reset() on SparkPlan, handle cases when ExchangeCoordinator is set, minor fixes
1 parent: 3e9c1c9

8 files changed: +159, -101 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala

Lines changed: 8 additions & 13 deletions

@@ -235,11 +235,6 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
     if (changed) makeCopy(newArgs) else this
   }
 
-  /**
-   * Returns a deep copy of the subtree from the node.
-   */
-  def makeDeepCopy(): BaseType = mapChildren(_.makeDeepCopy(), true)
-
   /**
    * Returns a copy of this node where `rule` has been recursively applied to the tree.
    * When `rule` does not apply to a given node it is left unchanged.
@@ -294,13 +289,13 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
   /**
    * Returns a copy of this node where `f` has been applied to all the nodes children.
    */
-  def mapChildren(f: BaseType => BaseType, forceCopy: Boolean = false): BaseType = {
-    if (forceCopy || children.nonEmpty) {
+  def mapChildren(f: BaseType => BaseType): BaseType = {
+    if (children.nonEmpty) {
       var changed = false
       def mapChild(child: Any): Any = child match {
         case arg: TreeNode[_] if containsChild(arg) =>
           val newChild = f(arg.asInstanceOf[BaseType])
-          if (forceCopy || !(newChild fastEquals arg)) {
+          if (!(newChild fastEquals arg)) {
             changed = true
             newChild
           } else {
@@ -319,7 +314,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
             arg2.asInstanceOf[BaseType]
           }
 
-          if (forceCopy || !(newChild1 fastEquals arg1) || !(newChild2 fastEquals arg2)) {
+          if (!(newChild1 fastEquals arg1) || !(newChild2 fastEquals arg2)) {
             changed = true
             (newChild1, newChild2)
           } else {
@@ -331,15 +326,15 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
       val newArgs = mapProductIterator {
         case arg: TreeNode[_] if containsChild(arg) =>
           val newChild = f(arg.asInstanceOf[BaseType])
-          if (forceCopy || !(newChild fastEquals arg)) {
+          if (!(newChild fastEquals arg)) {
            changed = true
            newChild
          } else {
            arg
          }
        case Some(arg: TreeNode[_]) if containsChild(arg) =>
          val newChild = f(arg.asInstanceOf[BaseType])
-          if (forceCopy || !(newChild fastEquals arg)) {
+          if (!(newChild fastEquals arg)) {
            changed = true
            Some(newChild)
          } else {
@@ -348,7 +343,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
       case m: Map[_, _] => m.mapValues {
         case arg: TreeNode[_] if containsChild(arg) =>
           val newChild = f(arg.asInstanceOf[BaseType])
-          if (forceCopy || !(newChild fastEquals arg)) {
+          if (!(newChild fastEquals arg)) {
             changed = true
             newChild
           } else {
@@ -362,7 +357,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
         case nonChild: AnyRef => nonChild
         case null => null
       }
-      if (forceCopy || changed) makeCopy(newArgs) else this
+      if (changed) makeCopy(newArgs) else this
     } else {
       this
     }
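Side note on the revert above: with makeDeepCopy and the forceCopy flag gone, mapChildren is back to its plain copy-on-change contract — a node is only re-created when one of its children actually changes, otherwise the same instance is returned. A minimal, self-contained sketch of that contract (Node and MapChildrenDemo are made-up names, not Spark classes):

// Copy-on-change: re-create a node only when a child was actually rewritten.
case class Node(value: Int, children: Seq[Node]) {
  def mapChildren(f: Node => Node): Node = {
    if (children.isEmpty) {
      this
    } else {
      var changed = false
      val newChildren = children.map { c =>
        val nc = f(c)
        if (!(nc eq c)) changed = true
        nc
      }
      // Unchanged subtrees keep sharing the same instances.
      if (changed) copy(children = newChildren) else this
    }
  }
}

object MapChildrenDemo extends App {
  val tree = Node(1, Seq(Node(2, Nil), Node(3, Nil)))
  // Identity rewrite: nothing changes, the very same root is returned.
  assert(tree.mapChildren(identity) eq tree)
  // A real rewrite produces a new root but still shares the untouched child.
  val rewritten = tree.mapChildren(c => if (c.value == 2) c.copy(value = 20) else c)
  assert(!(rewritten eq tree))
  assert(rewritten.children(1) eq tree.children(1))
}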

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 2 additions & 0 deletions

@@ -2056,6 +2056,8 @@ class SQLConf extends Serializable with Logging {
   def setCommandRejectsSparkCoreConfs: Boolean =
     getConf(SQLConf.SET_COMMAND_REJECTS_SPARK_CORE_CONFS)
 
+  def recursionLevelLimit: Int = getConf(SQLConf.RECURSION_LEVEL_LIMIT)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala

Lines changed: 23 additions & 2 deletions

@@ -212,18 +212,39 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
     }
   }
 
+  protected def resetSubqueries(): Unit = {
+    expressions.foreach {
+      _.foreach {
+        case e: ExecSubqueryExpression => e.plan.reset()
+        case _ =>
+      }
+    }
+    runningSubqueries.clear()
+  }
+
+  final def reset(): Unit = {
+    children.foreach(_.reset())
+    synchronized {
+      if (prepared) {
+        resetSubqueries()
+        doReset()
+        prepared = false
+      }
+    }
+  }
+
   /**
    * Overridden by concrete implementations of SparkPlan. It is guaranteed to run before any
    * `execute` of SparkPlan. This is helpful if we want to set up some state before executing the
    * query, e.g., `BroadcastHashJoin` uses it to broadcast asynchronously.
    *
    * @note `prepare` method has already walked down the tree, so the implementation doesn't have
    * to call children's `prepare` methods.
-   *
-   * This will only be called once, protected by `this`.
    */
   protected def doPrepare(): Unit = {}
 
+  protected def doReset(): Unit = {}
+
   /**
    * Produces the result of the query as an `RDD[InternalRow]`
   *
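The reset() added above is the counterpart of prepare(): it visits the children first, then clears the node's own prepared state so the subtree can be prepared and executed again. A rough sketch of that lifecycle, assuming made-up ToyPlan/ToyExchange classes rather than the real SparkPlan:

// Toy prepare/reset lifecycle: doPrepare() runs once per prepared cycle,
// doReset() drops prepare-time state so the next prepare() starts clean.
abstract class ToyPlan {
  def children: Seq[ToyPlan]
  private var prepared = false

  final def prepare(): Unit = {
    children.foreach(_.prepare())
    synchronized {
      if (!prepared) {
        doPrepare()
        prepared = true
      }
    }
  }

  final def reset(): Unit = {
    children.foreach(_.reset())
    synchronized {
      if (prepared) {
        doReset()
        prepared = false
      }
    }
  }

  protected def doPrepare(): Unit = {}
  protected def doReset(): Unit = {}
}

class ToyExchange(val children: Seq[ToyPlan]) extends ToyPlan {
  // Cached, prepare-time state that must be dropped on reset.
  private var cached: Option[String] = None
  override protected def doPrepare(): Unit = { cached = Some("broadcasted relation") }
  override protected def doReset(): Unit = { cached = None }
}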

sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala

Lines changed: 41 additions & 23 deletions

@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.LongType
@@ -243,14 +244,20 @@ case class RecursiveTableExec(
     var tempCount = temp.count()
     var result = temp
     var level = 0
-    val levelLimit = conf.getConf(SQLConf.RECURSION_LEVEL_LIMIT)
+    val levelLimit = conf.recursionLevelLimit
     do {
       if (level > levelLimit) {
         throw new SparkException("Recursion level limit reached but query hasn't exhausted, try " +
           s"increasing ${SQLConf.RECURSION_LEVEL_LIMIT.key}")
       }
 
-      val newRecursiveTerm = recursiveTerm.makeDeepCopy()
+      val newRecursiveTerm = recursiveTerm.transform {
+        case se @ ShuffleExchangeExec(_, _, co) =>
+          co.map(c => se.copy(coordinator = Some(c.copy))).getOrElse(se)
+      }
+      if (level > 0) {
+        newRecursiveTerm.reset()
+      }
       newRecursiveTerm.foreach {
         _ match {
           case rr: RecursiveReferenceExec if rr.name == name => rr.recursiveTable = temp
@@ -726,37 +733,48 @@ case class SubqueryExec(name: String, child: SparkPlan) extends UnaryExecNode {
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 
   @transient
-  private lazy val relationFuture: Future[Array[InternalRow]] = {
-    // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here.
-    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
-    Future {
-      // This will run in another thread. Set the execution id so that we can connect these jobs
-      // with the correct execution.
-      SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) {
-        val beforeCollect = System.nanoTime()
-        // Note that we use .executeCollect() because we don't want to convert data to Scala types
-        val rows: Array[InternalRow] = child.executeCollect()
-        val beforeBuild = System.nanoTime()
-        longMetric("collectTime") += (beforeBuild - beforeCollect) / 1000000
-        val dataSize = rows.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
-        longMetric("dataSize") += dataSize
-
-        SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
-        rows
-      }
-    }(SubqueryExec.executionContext)
+  private var relationFuture: Future[Array[InternalRow]] = _
+
+  private def getRelationFuture(): Future[Array[InternalRow]] = {
+    if (relationFuture == null) {
+      // relationFuture is used in "doExecute". Therefore we can get the execution id correctly
+      // here.
+      val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+      relationFuture = Future {
+        // This will run in another thread. Set the execution id so that we can connect these jobs
+        // with the correct execution.
+        SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) {
+          val beforeCollect = System.nanoTime()
+          // Note that we use .executeCollect() because we don't want to convert data to Scala types
+          val rows: Array[InternalRow] = child.executeCollect()
+          val beforeBuild = System.nanoTime()
+          longMetric("collectTime") += (beforeBuild - beforeCollect) / 1000000
+          val dataSize = rows.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
+          longMetric("dataSize") += dataSize
+
+          SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
+          rows
+        }
+      }(SubqueryExec.executionContext)
+    }
+
+    relationFuture
   }
 
   protected override def doPrepare(): Unit = {
-    relationFuture
+    getRelationFuture()
+  }
+
+  override protected def doReset(): Unit = {
+    relationFuture = null
   }
 
   protected override def doExecute(): RDD[InternalRow] = {
     child.execute()
   }
 
   override def executeCollect(): Array[InternalRow] = {
-    ThreadUtils.awaitResult(relationFuture, Duration.Inf)
+    ThreadUtils.awaitResult(getRelationFuture(), Duration.Inf)
   }
 }
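For context, the loop in RecursiveTableExec now re-executes the recursive term on each iteration instead of deep-copying it: the plan is transformed to hand any ShuffleExchangeExec a fresh coordinator, and from the second iteration on the subtree is reset first so prepare-time state is rebuilt rather than reused stale. A simplified sketch of that driver shape (ToyResettable and runIterations are hypothetical names, not the actual implementation):

object RecursionLoopSketch {
  // Minimal stand-in for a resettable, re-executable operator subtree.
  trait ToyResettable {
    def reset(): Unit
    def executeOnce(): Boolean // returns true when no new rows were produced
  }

  def runIterations(term: ToyResettable, levelLimit: Int): Unit = {
    var level = 0
    var exhausted = false
    do {
      if (level > levelLimit) {
        throw new IllegalStateException(s"recursion level limit $levelLimit reached")
      }
      if (level > 0) {
        term.reset() // drop state from the previous iteration before re-executing
      }
      exhausted = term.executeOnce()
      level += 1
    } while (!exhausted)
  }
}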

sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala

Lines changed: 74 additions & 63 deletions

@@ -66,74 +66,85 @@ case class BroadcastExchangeExec(
   }
 
   @transient
-  private lazy val relationFuture: Future[broadcast.Broadcast[Any]] = {
-    // broadcastFuture is used in "doExecute". Therefore we can get the execution id correctly here.
-    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
-    Future {
-      // This will run in another thread. Set the execution id so that we can connect these jobs
-      // with the correct execution.
-      SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) {
-        try {
-          val beforeCollect = System.nanoTime()
-          // Use executeCollect/executeCollectIterator to avoid conversion to Scala types
-          val (numRows, input) = child.executeCollectIterator()
-          if (numRows >= 512000000) {
-            throw new SparkException(
-              s"Cannot broadcast the table with 512 million or more rows: $numRows rows")
-          }
-
-          val beforeBuild = System.nanoTime()
-          longMetric("collectTime") += (beforeBuild - beforeCollect) / 1000000
-
-          // Construct the relation.
-          val relation = mode.transform(input, Some(numRows))
-
-          val dataSize = relation match {
-            case map: HashedRelation =>
-              map.estimatedSize
-            case arr: Array[InternalRow] =>
-              arr.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
-            case _ =>
-              throw new SparkException("[BUG] BroadcastMode.transform returned unexpected type: " +
-                relation.getClass.getName)
-          }
-
-          longMetric("dataSize") += dataSize
-          if (dataSize >= (8L << 30)) {
-            throw new SparkException(
-              s"Cannot broadcast the table that is larger than 8GB: ${dataSize >> 30} GB")
-          }
-
-          val beforeBroadcast = System.nanoTime()
-          longMetric("buildTime") += (beforeBroadcast - beforeBuild) / 1000000
-
-          // Broadcast the relation
-          val broadcasted = sparkContext.broadcast(relation)
-          longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
-
-          SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
-          broadcasted
-        } catch {
-          // SPARK-24294: To bypass scala bug: https://github.com/scala/bug/issues/9554, we throw
-          // SparkFatalException, which is a subclass of Exception. ThreadUtils.awaitResult
-          // will catch this exception and re-throw the wrapped fatal throwable.
-          case oe: OutOfMemoryError =>
-            throw new SparkFatalException(
-              new OutOfMemoryError(s"Not enough memory to build and broadcast the table to " +
-                s"all worker nodes. As a workaround, you can either disable broadcast by setting " +
-                s"${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key} to -1 or increase the spark driver " +
-                s"memory by setting ${SparkLauncher.DRIVER_MEMORY} to a higher value")
-                .initCause(oe.getCause))
-          case e if !NonFatal(e) =>
-            throw new SparkFatalException(e)
-        }
-      }
-    }(BroadcastExchangeExec.executionContext)
+  private var relationFuture: Future[broadcast.Broadcast[Any]] = _
+
+  private def getRelationFuture() = {
+    if (relationFuture == null) {
+      // broadcastFuture is used in "doExecute". Therefore we can get the execution id correctly
+      // here.
+      val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+      relationFuture = Future {
+        // This will run in another thread. Set the execution id so that we can connect these jobs
+        // with the correct execution.
+        SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) {
+          try {
+            val beforeCollect = System.nanoTime()
+            // Use executeCollect/executeCollectIterator to avoid conversion to Scala types
+            val (numRows, input) = child.executeCollectIterator()
+            if (numRows >= 512000000) {
+              throw new SparkException(
+                s"Cannot broadcast the table with 512 million or more rows: $numRows rows")
+            }
+
+            val beforeBuild = System.nanoTime()
+            longMetric("collectTime") += (beforeBuild - beforeCollect) / 1000000
+
+            // Construct the relation.
+            val relation = mode.transform(input, Some(numRows))
+
+            val dataSize = relation match {
+              case map: HashedRelation =>
+                map.estimatedSize
+              case arr: Array[InternalRow] =>
+                arr.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
+              case _ =>
+                throw new SparkException("[BUG] BroadcastMode.transform returned unexpected " +
+                  "type: " + relation.getClass.getName)
+            }
+
+            longMetric("dataSize") += dataSize
+            if (dataSize >= (8L << 30)) {
+              throw new SparkException(
+                s"Cannot broadcast the table that is larger than 8GB: ${dataSize >> 30} GB")
+            }
+
+            val beforeBroadcast = System.nanoTime()
+            longMetric("buildTime") += (beforeBroadcast - beforeBuild) / 1000000
+
+            // Broadcast the relation
+            val broadcasted = sparkContext.broadcast(relation)
+            longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
+
+            SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
+            broadcasted
+          } catch {
+            // SPARK-24294: To bypass scala bug: https://github.com/scala/bug/issues/9554, we throw
+            // SparkFatalException, which is a subclass of Exception. ThreadUtils.awaitResult
+            // will catch this exception and re-throw the wrapped fatal throwable.
+            case oe: OutOfMemoryError =>
+              throw new SparkFatalException(
+                new OutOfMemoryError(s"Not enough memory to build and broadcast the table to " +
+                  s"all worker nodes. As a workaround, you can either disable broadcast by setting " +
+                  s"${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key} to -1 or increase the spark driver " +
+                  s"memory by setting ${SparkLauncher.DRIVER_MEMORY} to a higher value")
+                  .initCause(oe.getCause))
+            case e if !NonFatal(e) =>
+              throw new SparkFatalException(e)
+          }
+        }
+      }(BroadcastExchangeExec.executionContext)
+    }
+
+    relationFuture
   }
 
   override protected def doPrepare(): Unit = {
     // Materialize the future.
-    relationFuture
+    getRelationFuture()
+  }
+
+  override protected def doReset(): Unit = {
+    relationFuture = null
   }
 
   override protected def doExecute(): RDD[InternalRow] = {
@@ -143,7 +154,7 @@ case class BroadcastExchangeExec(
 
   override protected[sql] def doExecuteBroadcast[T](): broadcast.Broadcast[T] = {
     try {
-      ThreadUtils.awaitResult(relationFuture, timeout).asInstanceOf[broadcast.Broadcast[T]]
+      ThreadUtils.awaitResult(getRelationFuture(), timeout).asInstanceOf[broadcast.Broadcast[T]]
     } catch {
       case ex: TimeoutException =>
        logError(s"Could not execute broadcast in ${timeout.toSeconds} secs.", ex)
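The same pattern is applied to SubqueryExec and BroadcastExchangeExec above: the lazy val future becomes a nullable var behind a getter, so doReset() can drop it and the next prepare/execute kicks off a fresh asynchronous job (a lazy val can never be un-computed). A stripped-down sketch of the idea, with a made-up ToyBroadcastStage class standing in for the real operators:

import scala.concurrent.{ExecutionContext, Future}

class ToyBroadcastStage(build: () => String)(implicit ec: ExecutionContext) {
  // Was: private lazy val relationFuture = Future(build()) -- once computed, a lazy val
  // would keep returning the stale result even after the plan is reset.
  @transient private var relationFuture: Future[String] = _

  private def getRelationFuture(): Future[String] = {
    if (relationFuture == null) {
      relationFuture = Future(build()) // started at most once per prepared lifecycle
    }
    relationFuture
  }

  def doPrepare(): Unit = { getRelationFuture() }  // materialize the future early
  def doReset(): Unit = { relationFuture = null }  // next prepare/execute starts a new job
  def result(): Future[String] = getRelationFuture()
}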

sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala

Lines changed: 3 additions & 0 deletions

@@ -274,4 +274,7 @@ class ExchangeCoordinator(
   override def toString: String = {
     s"coordinator[target post-shuffle partition size: $advisoryTargetPostShuffleInputSize]"
   }
+
+  def copy: ExchangeCoordinator =
+    new ExchangeCoordinator(advisoryTargetPostShuffleInputSize, minNumPostShufflePartitions)
 }
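The copy added here matters for the RecursiveTableExec change above: a coordinator accumulates per-execution state, so each cloned ShuffleExchangeExec gets a copy that shares only the configuration, not the state of a previous run. A toy illustration (ToyCoordinator is a made-up class, not the real ExchangeCoordinator):

class ToyCoordinator(targetSizeInBytes: Long, minPartitions: Option[Int]) {
  // Per-execution state: exchanges that registered with this coordinator instance.
  private val registered = scala.collection.mutable.ArrayBuffer.empty[String]
  def register(exchangeId: String): Unit = registered += exchangeId
  def numRegistered: Int = registered.size

  // Same configuration, none of the accumulated state.
  def copy: ToyCoordinator = new ToyCoordinator(targetSizeInBytes, minPartitions)
}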
