
Commit ad3bdad

Revert SPARK-21052 in the Spark 2.3 branch
1 parent 96a5a12 commit ad3bdad

File tree

5 files changed (+6 additions, −160 deletions)


sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala

Lines changed: 2 additions & 22 deletions
@@ -47,8 +47,7 @@ case class BroadcastHashJoinExec(
   extends BinaryExecNode with HashJoin with CodegenSupport {

   override lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
-    "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

   override def requiredChildDistribution: Seq[Distribution] = {
     val mode = HashedRelationBroadcastMode(buildKeys)
@@ -62,13 +61,12 @@ case class BroadcastHashJoinExec(

   protected override def doExecute(): RDD[InternalRow] = {
     val numOutputRows = longMetric("numOutputRows")
-    val avgHashProbe = longMetric("avgHashProbe")

     val broadcastRelation = buildPlan.executeBroadcast[HashedRelation]()
     streamedPlan.execute().mapPartitions { streamedIter =>
       val hashed = broadcastRelation.value.asReadOnlyCopy()
       TaskContext.get().taskMetrics().incPeakExecutionMemory(hashed.estimatedSize)
-      join(streamedIter, hashed, numOutputRows, avgHashProbe)
+      join(streamedIter, hashed, numOutputRows)
     }
   }

@@ -110,23 +108,6 @@ case class BroadcastHashJoinExec(
     }
   }

-  /**
-   * Returns the codes used to add a task completion listener to update avg hash probe
-   * at the end of the task.
-   */
-  private def genTaskListener(avgHashProbe: String, relationTerm: String): String = {
-    val listenerClass = classOf[TaskCompletionListener].getName
-    val taskContextClass = classOf[TaskContext].getName
-    s"""
-       | $taskContextClass$$.MODULE$$.get().addTaskCompletionListener(new $listenerClass() {
-       |   @Override
-       |   public void onTaskCompletion($taskContextClass context) {
-       |     $avgHashProbe.set($relationTerm.getAverageProbesPerLookup());
-       |   }
-       | });
-     """.stripMargin
-  }
-
   /**
    * Returns a tuple of Broadcast of HashedRelation and the variable name for it.
    */
@@ -144,7 +125,6 @@ case class BroadcastHashJoinExec(
       v => s"""
          | $v = (($clsName) $broadcast.value()).asReadOnlyCopy();
          | incPeakExecutionMemory($v.estimatedSize());
-         | ${genTaskListener(avgHashProbe, v)}
       """.stripMargin, forceInline = true)
     (broadcastRelation, relationTerm)
   }
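The removed genTaskListener above is a small code-generation helper: it splices class and variable names into a Java source fragment using a Scala interpolated string and stripMargin. A stripped-down sketch of that technique, using a hypothetical helper and counter name rather than the actual Spark codegen API:

```scala
object CodegenSketch {
  // Hypothetical helper in the style of the removed genTaskListener:
  // splice runtime-chosen identifiers into a Java source fragment.
  def genIncrementSnippet(counterTerm: String, deltaExpr: String): String =
    s"""
       |$counterTerm += $deltaExpr;
     """.stripMargin.trim

  def main(args: Array[String]): Unit = {
    // Prints: numOutputRows += 1;
    println(genIncrementSnippet("numOutputRows", "1"))
  }
}
```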

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala

Lines changed: 1 addition & 6 deletions
@@ -194,8 +194,7 @@ trait HashJoin {
   protected def join(
       streamedIter: Iterator[InternalRow],
       hashed: HashedRelation,
-      numOutputRows: SQLMetric,
-      avgHashProbe: SQLMetric): Iterator[InternalRow] = {
+      numOutputRows: SQLMetric): Iterator[InternalRow] = {

     val joinedIter = joinType match {
       case _: InnerLike =>
@@ -213,10 +212,6 @@ trait HashJoin {
         s"BroadcastHashJoin should not take $x as the JoinType")
     }

-    // At the end of the task, we update the avg hash probe.
-    TaskContext.get().addTaskCompletionListener(_ =>
-      avgHashProbe.set(hashed.getAverageProbesPerLookup))
-
     val resultProj = createResultProjection
     joinedIter.map { r =>
       numOutputRows += 1
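The block removed from HashJoin.join registered a task-completion listener so the probe average was published once per task, after all lookups had happened. A minimal, self-contained sketch of that pattern; ProbeSource and the AtomicLong sink are stand-ins for HashedRelation and the SQLMetric, and the snippet assumes it runs inside a Spark task (TaskContext.get() is null on the driver):

```scala
import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.TaskContext
import org.apache.spark.util.TaskCompletionListener

// Hypothetical stand-in for the getAverageProbesPerLookup method this revert removes.
trait ProbeSource {
  def getAverageProbesPerLookup: Double
}

object AvgProbeSketch {
  // Defer the metric update to task completion so the reported average covers
  // every lookup the task performed, mirroring the removed code.
  def reportOnCompletion(hashed: ProbeSource, sink: AtomicLong): Unit = {
    TaskContext.get().addTaskCompletionListener(new TaskCompletionListener {
      override def onTaskCompletion(context: TaskContext): Unit = {
        sink.set(math.round(hashed.getAverageProbesPerLookup))
      }
    })
  }
}
```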

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala

Lines changed: 0 additions & 35 deletions
@@ -80,11 +80,6 @@ private[execution] sealed trait HashedRelation extends KnownSizeEstimation {
    * Release any used resources.
    */
   def close(): Unit
-
-  /**
-   * Returns the average number of probes per key lookup.
-   */
-  def getAverageProbesPerLookup: Double
 }

 private[execution] object HashedRelation {
@@ -280,8 +275,6 @@
   override def read(kryo: Kryo, in: Input): Unit = Utils.tryOrIOException {
     read(() => in.readInt(), () => in.readLong(), in.readBytes)
   }
-
-  override def getAverageProbesPerLookup: Double = binaryMap.getAverageProbesPerLookup
 }

 private[joins] object UnsafeHashedRelation {
@@ -395,10 +388,6 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
   // The number of unique keys.
   private var numKeys = 0L

-  // Tracking average number of probes per key lookup.
-  private var numKeyLookups = 0L
-  private var numProbes = 0L
-
   // needed by serializer
   def this() = {
     this(
@@ -483,8 +472,6 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
    */
   def getValue(key: Long, resultRow: UnsafeRow): UnsafeRow = {
     if (isDense) {
-      numKeyLookups += 1
-      numProbes += 1
       if (key >= minKey && key <= maxKey) {
         val value = array((key - minKey).toInt)
         if (value > 0) {
@@ -493,14 +480,11 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
       }
     } else {
       var pos = firstSlot(key)
-      numKeyLookups += 1
-      numProbes += 1
       while (array(pos + 1) != 0) {
         if (array(pos) == key) {
           return getRow(array(pos + 1), resultRow)
         }
         pos = nextSlot(pos)
-        numProbes += 1
       }
     }
     null
@@ -528,8 +512,6 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
    */
   def get(key: Long, resultRow: UnsafeRow): Iterator[UnsafeRow] = {
     if (isDense) {
-      numKeyLookups += 1
-      numProbes += 1
       if (key >= minKey && key <= maxKey) {
         val value = array((key - minKey).toInt)
         if (value > 0) {
@@ -538,14 +520,11 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
       }
     } else {
       var pos = firstSlot(key)
-      numKeyLookups += 1
-      numProbes += 1
       while (array(pos + 1) != 0) {
         if (array(pos) == key) {
           return valueIter(array(pos + 1), resultRow)
         }
         pos = nextSlot(pos)
-        numProbes += 1
       }
     }
     null
@@ -585,11 +564,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
   private def updateIndex(key: Long, address: Long): Unit = {
     var pos = firstSlot(key)
     assert(numKeys < array.length / 2)
-    numKeyLookups += 1
-    numProbes += 1
     while (array(pos) != key && array(pos + 1) != 0) {
       pos = nextSlot(pos)
-      numProbes += 1
     }
     if (array(pos + 1) == 0) {
       // this is the first value for this key, put the address in array.
@@ -721,8 +697,6 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
     writeLong(maxKey)
     writeLong(numKeys)
     writeLong(numValues)
-    writeLong(numKeyLookups)
-    writeLong(numProbes)

     writeLong(array.length)
     writeLongArray(writeBuffer, array, array.length)
@@ -764,8 +738,6 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
     maxKey = readLong()
     numKeys = readLong()
     numValues = readLong()
-    numKeyLookups = readLong()
-    numProbes = readLong()

     val length = readLong().toInt
     mask = length - 2
@@ -783,11 +755,6 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
   override def read(kryo: Kryo, in: Input): Unit = {
     read(() => in.readBoolean(), () => in.readLong(), in.readBytes)
   }
-
-  /**
-   * Returns the average number of probes per key lookup.
-   */
-  def getAverageProbesPerLookup: Double = numProbes.toDouble / numKeyLookups
 }

 private[joins] class LongHashedRelation(
@@ -839,8 +806,6 @@ private[joins] class LongHashedRelation(
     resultRow = new UnsafeRow(nFields)
     map = in.readObject().asInstanceOf[LongToUnsafeRowMap]
   }
-
-  override def getAverageProbesPerLookup: Double = map.getAverageProbesPerLookup
 }

 /**
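Everything removed from LongToUnsafeRowMap serves one statistic: numProbes counts hash-slot inspections, numKeyLookups counts lookups, and getAverageProbesPerLookup reports their ratio (a value near 1.0 means most keys are found on the first slot). A plain-Scala sketch of that bookkeeping, detached from the actual map implementation:

```scala
// Plain-Scala sketch of the counters this revert removes from LongToUnsafeRowMap.
final class ProbeStats {
  private var numKeyLookups = 0L
  private var numProbes = 0L

  // Call once per lookup with the number of slots inspected before the
  // key was found (or the search gave up).
  def recordLookup(probes: Long): Unit = {
    numKeyLookups += 1
    numProbes += probes
  }

  // Same formula as the removed getAverageProbesPerLookup, with a guard for zero lookups.
  def averageProbesPerLookup: Double =
    if (numKeyLookups == 0L) 0.0 else numProbes.toDouble / numKeyLookups
}
```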

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala

Lines changed: 2 additions & 4 deletions
@@ -42,8 +42,7 @@ case class ShuffledHashJoinExec(
   override lazy val metrics = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
     "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"),
-    "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"),
-    "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe"))
+    "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"))

   override def requiredChildDistribution: Seq[Distribution] =
     HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil
@@ -63,10 +62,9 @@ case class ShuffledHashJoinExec(

   protected override def doExecute(): RDD[InternalRow] = {
     val numOutputRows = longMetric("numOutputRows")
-    val avgHashProbe = longMetric("avgHashProbe")
     streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) =>
       val hashed = buildHashedRelation(buildIter)
-      join(streamIter, hashed, numOutputRows, avgHashProbe)
+      join(streamIter, hashed, numOutputRows)
     }
   }
 }
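The unchanged context above shows ShuffledHashJoinExec pairing the streamed and build sides with RDD.zipPartitions: each partition of one RDD is handed to a function together with the corresponding partition of the other. A tiny illustration with plain RDD[Int]s; the data and function are made up, only zipPartitions itself comes from the diff's context:

```scala
import org.apache.spark.SparkContext

object ZipPartitionsSketch {
  // Pair up corresponding partitions of two RDDs with the same partition count,
  // the same mechanism ShuffledHashJoinExec uses for the streamed and build sides.
  def pairwiseSums(sc: SparkContext): Array[Int] = {
    val left = sc.parallelize(1 to 4, numSlices = 2)
    val right = sc.parallelize(Seq(10, 20, 30, 40), numSlices = 2)
    left.zipPartitions(right) { (l, r) =>
      l.zip(r).map { case (a, b) => a + b }
    }.collect()
  }
}
```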

sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala

Lines changed: 1 addition & 93 deletions
@@ -231,50 +231,6 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared
     )
   }

-  test("BroadcastHashJoin metrics: track avg probe") {
-    // The executed plan looks like:
-    // Project [a#210, b#211, b#221]
-    // +- BroadcastHashJoin [a#210], [a#220], Inner, BuildRight
-    //    :- Project [_1#207 AS a#210, _2#208 AS b#211]
-    //    :  +- Filter isnotnull(_1#207)
-    //    :     +- LocalTableScan [_1#207, _2#208]
-    //    +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, binary, true]))
-    //       +- Project [_1#217 AS a#220, _2#218 AS b#221]
-    //          +- Filter isnotnull(_1#217)
-    //             +- LocalTableScan [_1#217, _2#218]
-    //
-    // Assume the execution plan with node id is
-    // WholeStageCodegen disabled:
-    // Project(nodeId = 0)
-    //   BroadcastHashJoin(nodeId = 1)
-    // ...(ignored)
-    //
-    // WholeStageCodegen enabled:
-    // WholeStageCodegen(nodeId = 0)
-    //   Project(nodeId = 1)
-    //     BroadcastHashJoin(nodeId = 2)
-    //       Project(nodeId = 3)
-    //         Filter(nodeId = 4)
-    // ...(ignored)
-    Seq(true, false).foreach { enableWholeStage =>
-      val df1 = generateRandomBytesDF()
-      val df2 = generateRandomBytesDF()
-      val df = df1.join(broadcast(df2), "a")
-      val nodeIds = if (enableWholeStage) {
-        Set(2L)
-      } else {
-        Set(1L)
-      }
-      val metrics = getSparkPlanMetrics(df, 2, nodeIds, enableWholeStage).get
-      nodeIds.foreach { nodeId =>
-        val probes = metrics(nodeId)._2("avg hash probe (min, med, max)")
-        probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe =>
-          assert(probe.toDouble > 1.0)
-        }
-      }
-    }
-  }
-
   test("ShuffledHashJoin metrics") {
     withSQLConf("spark.sql.autoBroadcastJoinThreshold" -> "40",
       "spark.sql.shuffle.partitions" -> "2",
@@ -287,59 +243,11 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared
       val metrics = getSparkPlanMetrics(df, 1, Set(1L))
       testSparkPlanMetrics(df, 1, Map(
         1L -> (("ShuffledHashJoin", Map(
-          "number of output rows" -> 2L,
-          "avg hash probe (min, med, max)" -> "\n(1, 1, 1)"))))
+          "number of output rows" -> 2L))))
       )
     }
   }

-  test("ShuffledHashJoin metrics: track avg probe") {
-    // The executed plan looks like:
-    // Project [a#308, b#309, b#319]
-    // +- ShuffledHashJoin [a#308], [a#318], Inner, BuildRight
-    //    :- Exchange hashpartitioning(a#308, 2)
-    //    :  +- Project [_1#305 AS a#308, _2#306 AS b#309]
-    //    :     +- Filter isnotnull(_1#305)
-    //    :        +- LocalTableScan [_1#305, _2#306]
-    //    +- Exchange hashpartitioning(a#318, 2)
-    //       +- Project [_1#315 AS a#318, _2#316 AS b#319]
-    //          +- Filter isnotnull(_1#315)
-    //             +- LocalTableScan [_1#315, _2#316]
-    //
-    // Assume the execution plan with node id is
-    // WholeStageCodegen disabled:
-    // Project(nodeId = 0)
-    //   ShuffledHashJoin(nodeId = 1)
-    // ...(ignored)
-    //
-    // WholeStageCodegen enabled:
-    // WholeStageCodegen(nodeId = 0)
-    //   Project(nodeId = 1)
-    //     ShuffledHashJoin(nodeId = 2)
-    // ...(ignored)
-    withSQLConf("spark.sql.autoBroadcastJoinThreshold" -> "5000000",
-      "spark.sql.shuffle.partitions" -> "2",
-      "spark.sql.join.preferSortMergeJoin" -> "false") {
-      Seq(true, false).foreach { enableWholeStage =>
-        val df1 = generateRandomBytesDF(65535 * 5)
-        val df2 = generateRandomBytesDF(65535)
-        val df = df1.join(df2, "a")
-        val nodeIds = if (enableWholeStage) {
-          Set(2L)
-        } else {
-          Set(1L)
-        }
-        val metrics = getSparkPlanMetrics(df, 1, nodeIds, enableWholeStage).get
-        nodeIds.foreach { nodeId =>
-          val probes = metrics(nodeId)._2("avg hash probe (min, med, max)")
-          probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe =>
-            assert(probe.toDouble > 1.0)
-          }
-        }
-      }
-    }
-  }
-
   test("BroadcastHashJoin(outer) metrics") {
     val df1 = Seq((1, "a"), (1, "b"), (4, "c")).toDF("key", "value")
     val df2 = Seq((1, "a"), (1, "b"), (2, "c"), (3, "d")).toDF("key2", "value")
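Both deleted tests read the rendered metric string, which looks like "\n(min, med, max)", and asserted that each value exceeded 1.0. The string handling in isolation, with a made-up sample value rather than numbers from a real run:

```scala
object AvgProbeStringSketch {
  def main(args: Array[String]): Unit = {
    // Example rendering only; real values came from getSparkPlanMetrics in the suite.
    val rendered = "\n(1.2, 1.5, 2.0)"
    val probes = rendered.stripPrefix("\n(").stripSuffix(")").split(", ").map(_.toDouble)
    // Same assertion the removed tests applied to each of min, med and max.
    assert(probes.forall(_ > 1.0))
  }
}
```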
