@@ -45,9 +45,17 @@ case class BroadcastExchangeExec[T: ClassTag](
     mode: broadcast.BroadcastMode[InternalRow],
     child: SparkPlan) extends Exchange {

-  override lazy val metrics = Map(
-    "buildTime" -> SQLMetrics.createMetric(sparkContext, "time to build (ms)"),
-    "broadcastTime" -> SQLMetrics.createMetric(sparkContext, "time to broadcast (ms)"))
+  override lazy val metrics = if (sqlContext.conf.executorSideBroadcastEnabled) {
+    Map(
+      "buildTime" -> SQLMetrics.createMetric(sparkContext, "time to build (ms)"),
+      "broadcastTime" -> SQLMetrics.createMetric(sparkContext, "time to broadcast (ms)"))
+  } else {
+    Map(
+      "dataSize" -> SQLMetrics.createMetric(sparkContext, "data size (bytes)"),
+      "collectTime" -> SQLMetrics.createMetric(sparkContext, "time to collect (ms)"),
+      "buildTime" -> SQLMetrics.createMetric(sparkContext, "time to build (ms)"),
+      "broadcastTime" -> SQLMetrics.createMetric(sparkContext, "time to broadcast (ms)"))
+  }

   override def outputPartitioning: Partitioning = BroadcastPartitioning(mode)

@@ -67,10 +75,60 @@ case class BroadcastExchangeExec[T: ClassTag](
     }
   }

-  // Private variable used to hold the reference of RDD created during broadcasting.
+  // Private variable used to hold a reference to the RDD created during executor-side broadcasting.
   // If we don't keep its reference, it will be cleaned up.
   private var childRDD: RDD[InternalRow] = null

+  private def executorSideBroadcast(): broadcast.Broadcast[Any] = {
+    val beforeBuild = System.nanoTime()
+    // Call persist on the RDD because we want to broadcast the RDD blocks on executors.
+    childRDD = child.execute().mapPartitionsInternal { rowIterator =>
+      rowIterator.map(_.copy())
+    }.persist(StorageLevel.MEMORY_AND_DISK)
+
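+    // count() materializes and caches the child's partitions, and yields the row count
+    // for the overflow check below.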
+    val numOfRows = childRDD.count()
+    if (numOfRows >= 512000000) {
+      throw new SparkException(
+        s"Cannot broadcast a table with more than 512 million rows: ${numOfRows} rows")
+    }
+
+    // Broadcast the relation on executors.
+    val beforeBroadcast = System.nanoTime()
+    longMetric("buildTime") += (beforeBroadcast - beforeBuild) / 1000000
+
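+    // broadcastRDDOnExecutor is not a stock SparkContext API; it is introduced by this
+    // patch series to build the broadcast value from the persisted RDD blocks on executors.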
+    val broadcasted = sparkContext.broadcastRDDOnExecutor[InternalRow, T](childRDD, mode)
+      .asInstanceOf[broadcast.Broadcast[Any]]
+
+    longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
+    broadcasted
+  }
+
+  private def driverSideBroadcast(): broadcast.Broadcast[Any] = {
+    val beforeCollect = System.nanoTime()
+    // Note that we use .executeCollect() because we don't want to convert data to
+    // Scala types.
+    val input: Array[InternalRow] = child.executeCollect()
+    if (input.length >= 512000000) {
+      throw new SparkException(
+        s"Cannot broadcast a table with more than 512 million rows: ${input.length} rows")
+    }
+    val beforeBuild = System.nanoTime()
+    longMetric("collectTime") += (beforeBuild - beforeCollect) / 1000000
+    val dataSize = input.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
+    longMetric("dataSize") += dataSize
+    if (dataSize >= (8L << 30)) {
+      throw new SparkException(
+        s"Cannot broadcast a table larger than 8GB: ${dataSize >> 30} GB")
+    }
+    // Construct and broadcast the relation.
+    val relation = mode.transform(input)
+    val beforeBroadcast = System.nanoTime()
+    longMetric("buildTime") += (beforeBroadcast - beforeBuild) / 1000000
+    val broadcasted = sparkContext.broadcast(relation)
+    longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
+    broadcasted
+  }
+
   @transient
   private lazy val relationFuture: Future[broadcast.Broadcast[Any]] = {
     // broadcastFuture is used in "doExecute". Therefore we can get the execution id correctly here.
@@ -80,27 +138,12 @@ case class BroadcastExchangeExec[T: ClassTag](
       // with the correct execution.
       SQLExecution.withExecutionId(sparkContext, executionId) {
         try {
-          val beforeBuild = System.nanoTime()
-          // Call persist on the RDD because we want to broadcast the RDD blocks on executors.
-          childRDD = child.execute().mapPartitionsInternal { rowIterator =>
-            rowIterator.map(_.copy())
-          }.persist(StorageLevel.MEMORY_AND_DISK)
-
-          val numOfRows = childRDD.count()
-          if (numOfRows >= 512000000) {
-            throw new SparkException(
-              s"Cannot broadcast the table with more than 512 millions rows: ${numOfRows} rows")
+          val broadcasted = if (sqlContext.conf.executorSideBroadcastEnabled) {
+            executorSideBroadcast()
+          } else {
+            driverSideBroadcast()
           }

-          // Broadcast the relation on executors.
-          val beforeBroadcast = System.nanoTime()
-          longMetric("buildTime") += (beforeBuild - beforeBroadcast) / 1000000
-
-          val broadcasted = sparkContext.broadcastRDDOnExecutor[InternalRow, T](childRDD,
-            mode).asInstanceOf[broadcast.Broadcast[Any]]
-
-          longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000
-
           // There are some cases we don't care about the metrics and call `SparkPlan.doExecute`
           // directly without setting an execution id. We should be tolerant to it.
           if (executionId != null) {
@@ -139,13 +182,6 @@ case class BroadcastExchangeExec[T: ClassTag](
 }

 object BroadcastExchangeExec {
-  /*
-  def apply[T: ClassTag](
-      mode: broadcast.BroadcastMode[InternalRow],
-      child: SparkPlan): BroadcastExchangeExec[T] =
-    BroadcastExchangeExec[T](mode, child, implicitly[ClassTag[T]])
-  */
-
   private[execution] val executionContext = ExecutionContext.fromExecutorService(
     ThreadUtils.newDaemonCachedThreadPool("broadcast-exchange", 128))
 }
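
For context, a minimal sketch of how this change might be exercised end to end. The config
key "spark.sql.executorSideBroadcast.enabled" is an assumption made for illustration; the
diff only shows the SQLConf accessor `executorSideBroadcastEnabled`, so the actual entry
name may differ.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.broadcast

object ExecutorSideBroadcastDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("executor-side-broadcast-demo")
      .master("local[4]")
      // Hypothetical key; only the accessor `executorSideBroadcastEnabled` appears in the diff.
      .config("spark.sql.executorSideBroadcast.enabled", "true")
      .getOrCreate()
    import spark.implicits._

    val small = Seq((1, "a"), (2, "b")).toDF("id", "tag")
    val large = spark.range(0, 1000000).selectExpr("CAST(id % 3 AS INT) AS id")

    // broadcast() hints the planner to place the small side behind BroadcastExchangeExec;
    // with the flag enabled, that exchange takes the executorSideBroadcast() path above.
    large.join(broadcast(small), "id").show()

    spark.stop()
  }
}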