
Commit f50cf31

Add developer api and a config to enable executor-side broadcast. Refactoring.
1 parent 57987d1 commit f50cf31

File tree

5 files changed, +141 -65 lines changed

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 2 additions & 0 deletions
@@ -1402,10 +1402,12 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   }
 
   /**
+   * :: DeveloperApi ::
    * Broadcast a read-only variable to the cluster, returning a
    * [[org.apache.spark.broadcast.Broadcast]] object for reading it in distributed functions.
    * The variable will be sent to each cluster only once.
    */
+  @DeveloperApi
   def broadcastRDDOnExecutor[T: ClassTag, U: ClassTag](
       rdd: RDD[T], mode: BroadcastMode[T]): Broadcast[U] = {
     assertNotStopped()
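
A minimal caller sketch for the API above, assuming a Spark build that contains this patch; BroadcastMode here is the org.apache.spark.broadcast trait referenced in the diffs. The wrapper itself is illustrative, not part of the commit, and only shows how the type parameters line up (T is the RDD element type, U the type of the value assembled by the mode).

import scala.reflect.ClassTag

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.{Broadcast, BroadcastMode}
import org.apache.spark.rdd.RDD

// Illustrative wrapper (not in this commit): blocks of `rdd` are persisted on
// executors and assembled there by `mode`; the driver never collects the rows.
def broadcastOnExecutors[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    rdd: RDD[T],
    mode: BroadcastMode[T]): Broadcast[U] = {
  sc.broadcastRDDOnExecutor[T, U](rdd, mode)
}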

sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala

Lines changed: 66 additions & 30 deletions
@@ -45,9 +45,17 @@ case class BroadcastExchangeExec[T: ClassTag](
     mode: broadcast.BroadcastMode[InternalRow],
     child: SparkPlan) extends Exchange {
 
-  override lazy val metrics = Map(
-    "buildTime" -> SQLMetrics.createMetric(sparkContext, "time to build (ms)"),
-    "broadcastTime" -> SQLMetrics.createMetric(sparkContext, "time to broadcast (ms)"))
+  override lazy val metrics = if (sqlContext.conf.executorSideBroadcastEnabled) {
+    Map(
+      "buildTime" -> SQLMetrics.createMetric(sparkContext, "time to build (ms)"),
+      "broadcastTime" -> SQLMetrics.createMetric(sparkContext, "time to broadcast (ms)"))
+  } else {
+    Map(
+      "dataSize" -> SQLMetrics.createMetric(sparkContext, "data size (bytes)"),
+      "collectTime" -> SQLMetrics.createMetric(sparkContext, "time to collect (ms)"),
+      "buildTime" -> SQLMetrics.createMetric(sparkContext, "time to build (ms)"),
+      "broadcastTime" -> SQLMetrics.createMetric(sparkContext, "time to broadcast (ms)"))
+  }
 
   override def outputPartitioning: Partitioning = BroadcastPartitioning(mode)
 
@@ -67,10 +75,60 @@ case class BroadcastExchangeExec[T: ClassTag](
     }
   }
 
-  // Private variable used to hold the reference of RDD created during broadcasting.
+  // Private variable used to hold the reference of RDD created during executor-side broadcasting.
   // If we don't keep its reference, it will be cleaned up.
   private var childRDD: RDD[InternalRow] = null
 
+  private def executorSideBroadcast(): broadcast.Broadcast[Any] = {
+    val beforeBuild = System.nanoTime()
+    // Call persist on the RDD because we want to broadcast the RDD blocks on executors.
+    childRDD = child.execute().mapPartitionsInternal { rowIterator =>
+      rowIterator.map(_.copy())
+    }.persist(StorageLevel.MEMORY_AND_DISK)
+
+    val numOfRows = childRDD.count()
+    if (numOfRows >= 512000000) {
+      throw new SparkException(
+        s"Cannot broadcast the table with more than 512 millions rows: ${numOfRows} rows")
+    }
+
+    // Broadcast the relation on executors.
+    val beforeBroadcast = System.nanoTime()
+    longMetric("buildTime") += (beforeBuild - beforeBroadcast) / 1000000
+
+    val broadcasted = sparkContext.broadcastRDDOnExecutor[InternalRow, T](childRDD, mode)
+      .asInstanceOf[broadcast.Broadcast[Any]]
+
+    longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
+    broadcasted
+  }
+
+  private def driverSideBroadcast(): broadcast.Broadcast[Any] = {
+    val beforeCollect = System.nanoTime()
+    // Note that we use .executeCollect() because we don't want to convert data to
+    // Scala types
+    val input: Array[InternalRow] = child.executeCollect()
+    if (input.length >= 512000000) {
+      throw new SparkException(
+        s"Cannot broadcast the table with more than 512 millions rows: ${input.length} rows")
+    }
+    val beforeBuild = System.nanoTime()
+    longMetric("collectTime") += (beforeBuild - beforeCollect) / 1000000
+    val dataSize = input.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
+    longMetric("dataSize") += dataSize
+    if (dataSize >= (8L << 30)) {
+      throw new SparkException(
+        s"Cannot broadcast the table that is larger than 8GB: ${dataSize >> 30} GB")
+    }
+    // Construct and broadcast the relation.
+    val relation = mode.transform(input)
+    val beforeBroadcast = System.nanoTime()
+    longMetric("buildTime") += (beforeBroadcast - beforeBuild) / 1000000
+    val broadcasted = sparkContext.broadcast(relation)
+    longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
+    broadcasted
+  }
+
   @transient
   private lazy val relationFuture: Future[broadcast.Broadcast[Any]] = {
     // broadcastFuture is used in "doExecute". Therefore we can get the execution id correctly here.
@@ -80,27 +138,12 @@ case class BroadcastExchangeExec[T: ClassTag](
       // with the correct execution.
       SQLExecution.withExecutionId(sparkContext, executionId) {
         try {
-          val beforeBuild = System.nanoTime()
-          // Call persist on the RDD because we want to broadcast the RDD blocks on executors.
-          childRDD = child.execute().mapPartitionsInternal { rowIterator =>
-            rowIterator.map(_.copy())
-          }.persist(StorageLevel.MEMORY_AND_DISK)
-
-          val numOfRows = childRDD.count()
-          if (numOfRows >= 512000000) {
-            throw new SparkException(
-              s"Cannot broadcast the table with more than 512 millions rows: ${numOfRows} rows")
+          val broadcasted = if (sqlContext.conf.executorSideBroadcastEnabled) {
+            executorSideBroadcast()
+          } else {
+            driverSideBroadcast()
           }
 
-          // Broadcast the relation on executors.
-          val beforeBroadcast = System.nanoTime()
-          longMetric("buildTime") += (beforeBuild - beforeBroadcast) / 1000000
-
-          val broadcasted = sparkContext.broadcastRDDOnExecutor[InternalRow, T](childRDD,
-            mode).asInstanceOf[broadcast.Broadcast[Any]]
-
-          longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
-
           // There are some cases we don't care about the metrics and call `SparkPlan.doExecute`
           // directly without setting an execution id. We should be tolerant to it.
           if (executionId != null) {
@@ -139,13 +182,6 @@ case class BroadcastExchangeExec[T: ClassTag](
 }
 
 object BroadcastExchangeExec {
-  /*
-  def apply[T: ClassTag](
-      mode: broadcast.BroadcastMode[InternalRow],
-      child: SparkPlan): BroadcastExchangeExec[T] =
-    BroadcastExchangeExec[T](mode, child, implicitly[ClassTag[T]])
-  */
-
   private[execution] val executionContext = ExecutionContext.fromExecutorService(
     ThreadUtils.newDaemonCachedThreadPool("broadcast-exchange", 128))
 }
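
Both paths above guard the build side: each rejects inputs with 512 million rows or more, and the driver-side path also rejects relations of 8 GB or larger before calling sparkContext.broadcast. A quick plain-Scala check of the constants used in those guards (self-contained, runnable in the REPL):

// 8L << 30 is the 8 GiB byte limit used by driverSideBroadcast;
// `dataSize >> 30` converts bytes back to whole GiB for the error message.
val rowLimit  = 512000000L                  // row-count cap shared by both paths
val byteLimit = 8L << 30                    // 8589934592 bytes = 8 GiB
val dataSize  = 10L * 1024 * 1024 * 1024    // example: a 10 GiB build side
assert(dataSize >= byteLimit)
println(s"Cannot broadcast the table that is larger than 8GB: ${dataSize >> 30} GB")  // "... 10 GB"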

sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 12 additions & 0 deletions
@@ -287,6 +287,16 @@ object SQLConf {
     .intConf
     .createWithDefault(5 * 60)
 
+  val EXECUTOR_SIDE_BROADCAST_ENABLED = SQLConfigBuilder("spark.sql.executorSideBroadcast.enabled")
+    .doc("When true, we will use executor-side broadcast for BroadcastExchangeExec in sql. " +
+      "Notice that broadcasted pieces of data in executor-side broadcast are not persisted " +
+      "in the driver, but fetched from RDD pieces persisted in other executors. " +
+      "If one executor is lost before its piece is fetched by other executors, " +
+      "we can't recover it back and broadcasting will be failed. Thus it is not " +
+      "guaranteed completely safe when using with dynamic allocation.")
+    .booleanConf
+    .createWithDefault(true)
+
   // This is only used for the thriftserver
   val THRIFTSERVER_POOL = SQLConfigBuilder("spark.sql.thriftserver.scheduler.pool")
     .doc("Set a Fair Scheduler pool for a JDBC client session.")
@@ -688,6 +698,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {
 
   def broadcastTimeout: Int = getConf(BROADCAST_TIMEOUT)
 
+  def executorSideBroadcastEnabled: Boolean = getConf(EXECUTOR_SIDE_BROADCAST_ENABLED)
+
   def defaultDataSourceName: String = getConf(DEFAULT_DATA_SOURCE_NAME)
 
   def convertCTAS: Boolean = getConf(CONVERT_CTAS)
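
A short usage sketch for the flag defined above, assuming `spark` is an existing SparkSession on a build that includes this patch; the property key and default come straight from the definition:

// Fall back to the driver-side path for this session (the flag defaults to true here),
// then read the effective value back through the same runtime conf interface.
spark.conf.set("spark.sql.executorSideBroadcast.enabled", "false")
assert(spark.conf.get("spark.sql.executorSideBroadcast.enabled") == "false")

// The flag can also be set per application at submit time, e.g.:
//   spark-submit --conf spark.sql.executorSideBroadcast.enabled=false ...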

sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala

Lines changed: 27 additions & 16 deletions
@@ -127,26 +127,30 @@ class InnerJoinSuite extends SparkPlanTest with SharedSQLContext {
       EnsureRequirements(spark.sessionState.conf).apply(sortMergeJoin)
     }
 
-    test(s"$testName using BroadcastHashJoin (build=left)") {
+    def usingBroadcastHashJoin(buildSide: joins.BuildSide): Unit = {
       extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) =>
         withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
           checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) =>
             makeBroadcastHashJoin(
-              leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, joins.BuildLeft),
+              leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, buildSide),
             expectedAnswer.map(Row.fromTuple),
             sortAnswers = true)
         }
       }
     }
 
+    test(s"$testName using BroadcastHashJoin (build=left)") {
+      Seq("true", "false").foreach { executorSideBroadcast =>
+        withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+          usingBroadcastHashJoin(joins.BuildLeft)
+        }
+      }
+    }
+
     test(s"$testName using BroadcastHashJoin (build=right)") {
-      extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) =>
-        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
-          checkAnswer2(leftRows, rightRows, (leftPlan: SparkPlan, rightPlan: SparkPlan) =>
-            makeBroadcastHashJoin(
-              leftKeys, rightKeys, boundCondition, leftPlan, rightPlan, joins.BuildRight),
-            expectedAnswer.map(Row.fromTuple),
-            sortAnswers = true)
+      Seq("true", "false").foreach { executorSideBroadcast =>
+        withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+          usingBroadcastHashJoin(joins.BuildRight)
         }
       }
     }
@@ -196,21 +200,28 @@ class InnerJoinSuite extends SparkPlanTest with SharedSQLContext {
       }
     }
 
-    test(s"$testName using BroadcastNestedLoopJoin build left") {
+    def usingBroadcastNestedLoopJoin(buildSide: joins.BuildSide): Unit = {
       withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
         checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
-          BroadcastNestedLoopJoinExec(left, right, BuildLeft, Inner, Some(condition())),
+          BroadcastNestedLoopJoinExec(left, right, buildSide, Inner, Some(condition())),
           expectedAnswer.map(Row.fromTuple),
           sortAnswers = true)
       }
     }
 
+    test(s"$testName using BroadcastNestedLoopJoin build left") {
+      Seq("true", "false").foreach { executorSideBroadcast =>
+        withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+          usingBroadcastNestedLoopJoin(BuildLeft)
+        }
+      }
+    }
+
     test(s"$testName using BroadcastNestedLoopJoin build right") {
-      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
-        checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
-          BroadcastNestedLoopJoinExec(left, right, BuildRight, Inner, Some(condition())),
-          expectedAnswer.map(Row.fromTuple),
-          sortAnswers = true)
+      Seq("true", "false").foreach { executorSideBroadcast =>
+        withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+          usingBroadcastNestedLoopJoin(BuildRight)
         }
       }
     }

sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala

Lines changed: 34 additions & 19 deletions
@@ -92,20 +92,28 @@ class OuterJoinSuite extends SparkPlanTest with SharedSQLContext {
       }
     }
 
+    def usingBroadcastHashJoin(): Unit = {
+      val buildSide = joinType match {
+        case LeftOuter => BuildRight
+        case RightOuter => BuildLeft
+        case _ => fail(s"Unsupported join type $joinType")
+      }
+      extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) =>
+        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
+          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
+            BroadcastHashJoinExec(
+              leftKeys, rightKeys, joinType, buildSide, boundCondition, left, right),
+            expectedAnswer.map(Row.fromTuple),
+            sortAnswers = true)
+        }
+      }
+    }
+
     if (joinType != FullOuter) {
       test(s"$testName using BroadcastHashJoin") {
-        val buildSide = joinType match {
-          case LeftOuter => BuildRight
-          case RightOuter => BuildLeft
-          case _ => fail(s"Unsupported join type $joinType")
-        }
-        extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) =>
-          withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
-            checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
-              BroadcastHashJoinExec(
-                leftKeys, rightKeys, joinType, buildSide, boundCondition, left, right),
-              expectedAnswer.map(Row.fromTuple),
-              sortAnswers = true)
+        Seq("true", "false").foreach { executorSideBroadcast =>
+          withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+            usingBroadcastHashJoin()
           }
         }
       }
@@ -123,21 +131,28 @@ class OuterJoinSuite extends SparkPlanTest with SharedSQLContext {
       }
     }
 
-    test(s"$testName using BroadcastNestedLoopJoin build left") {
+    def usingBroadcastNestedLoopJoin(buildSide: BuildSide): Unit = {
       withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
         checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
-          BroadcastNestedLoopJoinExec(left, right, BuildLeft, joinType, Some(condition)),
+          BroadcastNestedLoopJoinExec(left, right, buildSide, joinType, Some(condition)),
           expectedAnswer.map(Row.fromTuple),
           sortAnswers = true)
       }
     }
 
+    test(s"$testName using BroadcastNestedLoopJoin build left") {
+      Seq("true", "false").foreach { executorSideBroadcast =>
+        withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+          usingBroadcastNestedLoopJoin(BuildLeft)
+        }
+      }
+    }
+
     test(s"$testName using BroadcastNestedLoopJoin build right") {
-      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
-        checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
-          BroadcastNestedLoopJoinExec(left, right, BuildRight, joinType, Some(condition)),
-          expectedAnswer.map(Row.fromTuple),
-          sortAnswers = true)
+      Seq("true", "false").foreach { executorSideBroadcast =>
+        withSQLConf(SQLConf.EXECUTOR_SIDE_BROADCAST_ENABLED.key -> executorSideBroadcast) {
+          usingBroadcastNestedLoopJoin(BuildRight)
        }
       }
     }
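
The two broadcast paths register different metrics on BroadcastExchangeExec ("dataSize" and "collectTime" exist only on the driver-side path), so a test outside the suites above could check which path ran by inspecting the executed plan. A hedged sketch, assuming `spark` is a SparkSession with this patch applied and `df1`/`df2` are hypothetical DataFrames joined with a broadcast hint:

import org.apache.spark.sql.functions.broadcast
import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec

val joined = df1.join(broadcast(df2), "id")
joined.collect()

// Per the BroadcastExchangeExec diff, "collectTime" is only registered when
// executor-side broadcast is disabled, so its presence indicates the driver-side path.
val exchange = joined.queryExecution.executedPlan.collectFirst {
  case e: BroadcastExchangeExec[_] => e
}
val usedDriverSidePath = exchange.exists(_.metrics.contains("collectTime"))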
