-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-1021] Defer the data-driven computation of partition bounds in so... #1689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ac67195
7143f97
ca8913e
b2b20e8
b88b5d4
4e334a9
50b6da6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,10 @@ import org.apache.spark.serializer.JavaSerializer | |
| import org.apache.spark.util.{CollectionsUtils, Utils} | ||
| import org.apache.spark.util.random.{XORShiftRandom, SamplingUtils} | ||
|
|
||
| import org.apache.spark.SparkContext.rddToAsyncRDDActions | ||
| import scala.concurrent.Await | ||
| import scala.concurrent.duration.Duration | ||
|
|
||
| /** | ||
| * An object that defines how the elements in a key-value pair RDD are partitioned by key. | ||
| * Maps each key to a partition ID, from 0 to `numPartitions - 1`. | ||
|
|
@@ -113,8 +117,12 @@ class RangePartitioner[K : Ordering : ClassTag, V]( | |
| private var ordering = implicitly[Ordering[K]] | ||
|
|
||
| // An array of upper bounds for the first (partitions - 1) partitions | ||
| private var rangeBounds: Array[K] = { | ||
| if (partitions <= 1) { | ||
| @volatile private var valRB: Array[K] = null | ||
|
|
||
| private def rangeBounds: Array[K] = this.synchronized { | ||
| if (valRB != null) return valRB | ||
|
|
||
| valRB = if (partitions <= 1) { | ||
| Array.empty | ||
| } else { | ||
| // This is the sample size we need to have roughly balanced output partitions, capped at 1M. | ||
|
|
@@ -152,6 +160,8 @@ class RangePartitioner[K : Ordering : ClassTag, V]( | |
| RangePartitioner.determineBounds(candidates, partitions) | ||
| } | ||
| } | ||
|
|
||
| valRB | ||
| } | ||
|
|
||
| def numPartitions = rangeBounds.length + 1 | ||
|
|
@@ -222,7 +232,8 @@ class RangePartitioner[K : Ordering : ClassTag, V]( | |
| } | ||
|
|
||
| @throws(classOf[IOException]) | ||
| private def readObject(in: ObjectInputStream) { | ||
| private def readObject(in: ObjectInputStream): Unit = this.synchronized { | ||
| if (valRB != null) return | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we not want to deserialize valRB if it is not null? Are you worried rangeBounds might be called while the deserialization is happening?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was also assuming readObject might be called in multiple threads. Can that happen?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. that's not possible |
||
| val sfactory = SparkEnv.get.serializer | ||
| sfactory match { | ||
| case js: JavaSerializer => in.defaultReadObject() | ||
|
|
@@ -234,7 +245,7 @@ class RangePartitioner[K : Ordering : ClassTag, V]( | |
| val ser = sfactory.newInstance() | ||
| Utils.deserializeViaNestedStream(in, ser) { ds => | ||
| implicit val classTag = ds.readObject[ClassTag[Array[K]]]() | ||
| rangeBounds = ds.readObject[Array[K]]() | ||
| valRB = ds.readObject[Array[K]]() | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -254,12 +265,18 @@ private[spark] object RangePartitioner { | |
| sampleSizePerPartition: Int): (Long, Array[(Int, Int, Array[K])]) = { | ||
| val shift = rdd.id | ||
| // val classTagK = classTag[K] // to avoid serializing the entire partitioner object | ||
| val sketched = rdd.mapPartitionsWithIndex { (idx, iter) => | ||
| // use collectAsync here to run this job as a future, which is cancellable | ||
| val sketchFuture = rdd.mapPartitionsWithIndex { (idx, iter) => | ||
| val seed = byteswap32(idx ^ (shift << 16)) | ||
| val (sample, n) = SamplingUtils.reservoirSampleAndCount( | ||
| iter, sampleSizePerPartition, seed) | ||
| Iterator((idx, n, sample)) | ||
| }.collect() | ||
| }.collectAsync() | ||
| // We do need the future's value to continue any further | ||
| val sketched = Await.ready(sketchFuture, Duration.Inf).value.get match { | ||
| case scala.util.Success(v) => v.toArray | ||
| case scala.util.Failure(e) => throw e | ||
| } | ||
| val numItems = sketched.map(_._2.toLong).sum | ||
| (numItems, sketched) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any idea on volatile's impact on read performance? rangeBounds is read multiple times in getPartition.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It wouldn't surprise me if this performance figure varied with different combinations of hardware and Java version; but for at least one such combination, volatile reads are roughly 2-3x as costly as non-volatile reads as long as they are uncontended -- much more expensive when there are concurrent writes to contend with. http://brooker.co.za/blog/2012/09/10/volatile.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My understanding of getPartitions was that it executes once, and is therefore "allowed to be expensive". Also, isn't rangeBounds generally only returning a reference to the array? (except for the first time, where it's computed)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is going to be called once for every record on workers actually.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we can rename valRB to _rangeBounds and use this directly in getPartition?