
Commit e93eb72

[SPARK-4964] refactor to add preferredLocations. depends on SPARK-4014

1 parent 356c7cc

File tree: 4 files changed, +114 -38 lines changed

external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaCluster.scala

Lines changed: 11 additions & 7 deletions
@@ -32,7 +32,7 @@ import kafka.consumer.{ConsumerConfig, SimpleConsumer}
  *   NOT zookeeper servers, specified in host1:port1,host2:port2 form
  */
 class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
-  import KafkaCluster.Err
+  import KafkaCluster.{Err, LeaderOffset}
 
   val seedBrokers: Array[(String, Int)] =
     kafkaParams.get("metadata.broker.list")
@@ -131,18 +131,18 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
 
   def getLatestLeaderOffsets(
       topicAndPartitions: Set[TopicAndPartition]
-    ): Either[Err, Map[TopicAndPartition, Long]] =
+    ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
     getLeaderOffsets(topicAndPartitions, OffsetRequest.LatestTime)
 
   def getEarliestLeaderOffsets(
       topicAndPartitions: Set[TopicAndPartition]
-    ): Either[Err, Map[TopicAndPartition, Long]] =
+    ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
     getLeaderOffsets(topicAndPartitions, OffsetRequest.EarliestTime)
 
   def getLeaderOffsets(
       topicAndPartitions: Set[TopicAndPartition],
       before: Long
-    ): Either[Err, Map[TopicAndPartition, Long]] =
+    ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
     getLeaderOffsets(topicAndPartitions, before, 1).right.map { r =>
       r.map { kv =>
         // mapValues isnt serializable, see SI-7005
@@ -159,11 +159,11 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
       topicAndPartitions: Set[TopicAndPartition],
       before: Long,
       maxNumOffsets: Int
-    ): Either[Err, Map[TopicAndPartition, Seq[Long]]] = {
+    ): Either[Err, Map[TopicAndPartition, Seq[LeaderOffset]]] = {
     findLeaders(topicAndPartitions).right.flatMap { tpToLeader =>
       val leaderToTp: Map[(String, Int), Seq[TopicAndPartition]] = flip(tpToLeader)
       val leaders = leaderToTp.keys
-      var result = Map[TopicAndPartition, Seq[Long]]()
+      var result = Map[TopicAndPartition, Seq[LeaderOffset]]()
       val errs = new Err
       withBrokers(leaders, errs) { consumer =>
         val needed: Seq[TopicAndPartition] = leaderToTp((consumer.host, consumer.port))
@@ -178,7 +178,9 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
           respMap.get(tp).foreach { errAndOffsets =>
             if (errAndOffsets.error == ErrorMapping.NoError) {
               if (errAndOffsets.offsets.nonEmpty) {
-                result += tp -> errAndOffsets.offsets
+                result += tp -> errAndOffsets.offsets.map { off =>
+                  LeaderOffset(consumer.host, consumer.port, off)
+                }
               } else {
                 errs.append(new Exception(
                   s"Empty offsets for ${tp}, is ${before} before log beginning?"))
@@ -297,6 +299,8 @@ class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
 object KafkaCluster {
   type Err = ArrayBuffer[Throwable]
 
+  case class LeaderOffset(host: String, port: Int, offset: Long)
+
   /** Make a consumer config without requiring group.id or zookeeper.connect,
     * since communicating with brokers also needs common settings such as timeout
     */
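
For orientation, here is a minimal sketch (not part of this commit) of how a caller might consume the changed return type; the broker address and topic name are placeholders:

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.kafka.KafkaCluster

// Placeholder broker list and topic/partition, purely for illustration.
val kc = new KafkaCluster(Map("metadata.broker.list" -> "localhost:9092"))
val tp = TopicAndPartition("some-topic", 0)

kc.getLatestLeaderOffsets(Set(tp)).fold(
  errs => sys.error(s"offset lookup failed: ${errs.mkString("\n")}"),
  offsets => offsets.foreach { case (tap, lo) =>
    // Each value is now a LeaderOffset carrying the leader's host/port alongside the offset.
    println(s"$tap -> leader ${lo.host}:${lo.port}, latest offset ${lo.offset}")
  }
)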

external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDD.scala

Lines changed: 90 additions & 23 deletions
@@ -31,16 +31,24 @@ import kafka.message.{MessageAndMetadata, MessageAndOffset}
 import kafka.serializer.Decoder
 import kafka.utils.VerifiableProperties
 
+
 case class KafkaRDDPartition(
   override val index: Int,
+  /** kafka topic name */
   topic: String,
+  /** kafka partition id */
   partition: Int,
+  /** inclusive starting offset */
   fromOffset: Long,
-  untilOffset: Long
+  /** exclusive ending offset */
+  untilOffset: Long,
+  /** preferred kafka host, i.e. the leader at the time the rdd was created */
+  host: String,
+  /** preferred kafka host's port */
+  port: Int
 ) extends Partition
 
 /** A batch-oriented interface for consuming from Kafka.
-  * Each given Kafka topic/partition corresponds to an RDD partition.
   * Starting and ending offsets are specified in advance,
   * so that you can control exactly-once semantics.
   * For an easy interface to Kafka-managed offsets,
@@ -49,10 +57,8 @@ case class KafkaRDDPartition(
   * configuration parameters</a>.
   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
-  * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive)
-  *   starting point of the batch
-  * @param untilOffsets per-topic/partition Kafka offsets defining the (exclusive)
-  *   ending point of the batch
+  * @param rddPartitions Each RDD partition corresponds to a
+  *   range of offsets for a given Kafka topic/partition
   * @param messageHandler function for translating each message into the desired type
   */
 class KafkaRDD[
@@ -63,20 +69,31 @@ class KafkaRDD[
   R: ClassTag](
     sc: SparkContext,
     val kafkaParams: Map[String, String],
-    val fromOffsets: Map[TopicAndPartition, Long],
-    val untilOffsets: Map[TopicAndPartition, Long],
+    val rddPartitions: Traversable[KafkaRDDPartition],
     messageHandler: MessageAndMetadata[K, V] => R
   ) extends RDD[R](sc, Nil) with Logging {
 
-  assert(fromOffsets.keys == untilOffsets.keys,
-    "Must provide both from and until offsets for each topic/partition")
+  /** per-topic/partition Kafka offsets defining the (inclusive) starting point of the batch */
+  def fromOffsets: Map[TopicAndPartition, Long] =
+    rddPartitions.map { kr =>
+      TopicAndPartition(kr.topic, kr.partition) -> kr.fromOffset
+    }.toMap
+
+  /** per-topic/partition Kafka offsets defining the (exclusive) ending point of the batch */
+  def untilOffsets: Map[TopicAndPartition, Long] =
+    rddPartitions.map { kr =>
+      TopicAndPartition(kr.topic, kr.partition) -> kr.untilOffset
+    }.toMap
+
+  override def getPartitions: Array[Partition] = rddPartitions.toArray
 
-  override def getPartitions: Array[Partition] = fromOffsets.zipWithIndex.map { kvi =>
-    val ((tp, from), index) = kvi
-    new KafkaRDDPartition(index, tp.topic, tp.partition, from, untilOffsets(tp))
-  }.toArray
+  override def getPreferredLocations(thePart: Partition): Seq[String] = {
+    val part = thePart.asInstanceOf[KafkaRDDPartition]
+    // TODO is additional hostname resolution necessary here
+    Seq(part.host)
+  }
 
-  override def compute(thePart: Partition, context: TaskContext) = {
+  override def compute(thePart: Partition, context: TaskContext): Iterator[R] = {
     val part = thePart.asInstanceOf[KafkaRDDPartition]
     if (part.fromOffset >= part.untilOffset) {
       log.warn("Beginning offset is same or after ending offset " +
@@ -86,25 +103,37 @@ class KafkaRDD[
     new NextIterator[R] {
       context.addTaskCompletionListener{ context => closeIfNeeded() }
 
-      val kc = new KafkaCluster(kafkaParams)
       log.info(s"Computing topic ${part.topic}, partition ${part.partition} " +
         s"offsets ${part.fromOffset} -> ${part.untilOffset}")
+
+      val kc = new KafkaCluster(kafkaParams)
       val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
         .newInstance(kc.config.props)
         .asInstanceOf[Decoder[K]]
       val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
        .newInstance(kc.config.props)
        .asInstanceOf[Decoder[V]]
-      val consumer: SimpleConsumer = kc.connectLeader(part.topic, part.partition).fold(
-        errs => throw new Exception(
-          s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
-            errs.mkString("\n")),
-        consumer => consumer
-      )
+      val consumer = connectLeader
      var requestOffset = part.fromOffset
      var iter: Iterator[MessageAndOffset] = null
 
-      def handleErr(resp: FetchResponse) {
+      // TODO broken until SPARK-4014 is resolved and attemptId / attemptNumber is meaningful.
+      // The idea is to use the provided preferred host, except on task retry attempts,
+      // to minimize number of kafka metadata requests
+      private def connectLeader: SimpleConsumer = {
+        if (context.attemptId > 0) {
+          kc.connectLeader(part.topic, part.partition).fold(
+            errs => throw new Exception(
+              s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
+                errs.mkString("\n")),
+            consumer => consumer
+          )
+        } else {
+          kc.connect(part.host, part.port)
+        }
+      }
+
+      private def handleErr(resp: FetchResponse) {
        if (resp.hasError) {
          val err = resp.errorCode(part.topic, part.partition)
          if (err == ErrorMapping.LeaderNotAvailableCode ||
@@ -160,3 +189,41 @@ class KafkaRDD[
   }
 
 }
+
+object KafkaRDD {
+  import KafkaCluster.LeaderOffset
+
+  /**
+   * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
+   * configuration parameters</a>.
+   *   Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
+   *   NOT zookeeper servers, specified in host1:port1,host2:port2 form.
+   * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive)
+   *   starting point of the batch
+   * @param untilOffsets per-topic/partition Kafka offsets defining the (exclusive)
+   *   ending point of the batch
+   * @param messageHandler function for translating each message into the desired type
+   */
+  def apply[
+    K: ClassTag,
+    V: ClassTag,
+    U <: Decoder[_]: ClassTag,
+    T <: Decoder[_]: ClassTag,
+    R: ClassTag](
+      sc: SparkContext,
+      kafkaParams: Map[String, String],
+      fromOffsets: Map[TopicAndPartition, Long],
+      untilOffsets: Map[TopicAndPartition, LeaderOffset],
+      messageHandler: MessageAndMetadata[K, V] => R
+    ): KafkaRDD[K, V, U, T, R] = {
+    assert(fromOffsets.keys == untilOffsets.keys,
+      "Must provide both from and until offsets for each topic/partition")
+
+    val partitions = fromOffsets.zipWithIndex.map { case ((tp, from), index) =>
+      val lo = untilOffsets(tp)
+      new KafkaRDDPartition(index, tp.topic, tp.partition, from, lo.offset, lo.host, lo.port)
+    }
+
+    new KafkaRDD[K, V, U, T, R](sc, kafkaParams, partitions, messageHandler)
+  }
+}
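
A short, hedged sketch (assuming an existing SparkContext sc and a reachable broker; the topic name and offsets are made-up values) of how the new companion constructor might be called:

import kafka.common.TopicAndPartition
import kafka.serializer.StringDecoder
import org.apache.spark.rdd.kafka.KafkaRDD
import org.apache.spark.rdd.kafka.KafkaCluster.LeaderOffset

// Hypothetical inputs: one partition of "some-topic", offsets [0, 100), leader at localhost:9092.
val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")
val tp = TopicAndPartition("some-topic", 0)
val from = Map(tp -> 0L)
val until = Map(tp -> LeaderOffset("localhost", 9092, 100L))

// apply() builds one KafkaRDDPartition per topic/partition, carrying the leader's host/port,
// which getPreferredLocations later reports back to the scheduler.
val rdd = KafkaRDD[String, String, StringDecoder, StringDecoder, String](
  sc, kafkaParams, from, until, mmd => s"${mmd.offset} ${mmd.message}")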

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DeterministicKafkaInputDStream.scala

Lines changed: 8 additions & 6 deletions
@@ -27,6 +27,7 @@ import kafka.serializer.Decoder
 import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rdd.kafka.{KafkaCluster, KafkaRDD}
+import org.apache.spark.rdd.kafka.KafkaCluster.LeaderOffset
 import org.apache.spark.streaming.{StreamingContext, Time}
 import org.apache.spark.streaming.dstream._
 
@@ -76,7 +77,7 @@ class DeterministicKafkaInputDStream[
   private var currentOffsets = fromOffsets
 
   @tailrec
-  private def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, Long] = {
+  private def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
     val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
     // Either.fold would confuse @tailrec, do it manually
     if (o.isLeft) {
@@ -93,20 +94,21 @@ class DeterministicKafkaInputDStream[
     }
   }
 
-  private def clamp(leaderOffsets: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
+  private def clamp(
+    leaderOffsets: Map[TopicAndPartition, LeaderOffset]): Map[TopicAndPartition, LeaderOffset] = {
     maxMessagesPerPartition.map { mmp =>
-      leaderOffsets.map { kv =>
-        kv._1 -> Math.min(currentOffsets(kv._1) + mmp, kv._2)
+      leaderOffsets.map { case (tp, lo) =>
+        tp -> lo.copy(offset = Math.min(currentOffsets(tp) + mmp, lo.offset))
       }
     }.getOrElse(leaderOffsets)
   }
 
   override def compute(validTime: Time): Option[KafkaRDD[K, V, U, T, R]] = {
     val untilOffsets = clamp(latestLeaderOffsets(maxRetries))
-    val rdd = new KafkaRDD[K, V, U, T, R](
+    val rdd = KafkaRDD[K, V, U, T, R](
       context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler)
 
-    currentOffsets = untilOffsets
+    currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)
     Some(rdd)
   }
 
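
To illustrate the clamp change above (illustration only, not from the commit; the offsets and per-partition cap are made up): the cap is applied to the offset while the leader's host and port are carried through via copy:

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.kafka.KafkaCluster.LeaderOffset

// Hypothetical state: current offset 40, at most 10 messages per batch, leader's latest offset 100.
val tp = TopicAndPartition("some-topic", 0)
val currentOffsets = Map(tp -> 40L)
val mmp = 10L
val latest = Map(tp -> LeaderOffset("localhost", 9092, 100L))

val clamped = latest.map { case (t, lo) =>
  t -> lo.copy(offset = Math.min(currentOffsets(t) + mmp, lo.offset))
}
// clamped(tp) == LeaderOffset("localhost", 9092, 50L): offset capped, host/port preserved.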

external/kafka/src/test/scala/org/apache/spark/rdd/kafka/KafkaRDDSuite.scala

Lines changed: 5 additions & 2 deletions
@@ -77,10 +77,13 @@ class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfter {
     for {
       topicPartitions <- kc.getPartitions(topics).right.toOption
       from <- kc.getConsumerOffsets(groupId, topicPartitions).right.toOption.orElse(
-        kc.getEarliestLeaderOffsets(topicPartitions).right.toOption)
+        kc.getEarliestLeaderOffsets(topicPartitions).right.toOption.map { offs =>
+          offs.map(kv => kv._1 -> kv._2.offset)
+        }
+      )
       until <- kc.getLatestLeaderOffsets(topicPartitions).right.toOption
     } yield {
-      new KafkaRDD[String, String, StringDecoder, StringDecoder, String](
+      KafkaRDD[String, String, StringDecoder, StringDecoder, String](
        sc, kc.kafkaParams, from, until, mmd => s"${mmd.offset} ${mmd.message}")
     }
   }
