Skip to content

Commit ef1c962

Browse files
jmakesJacob Maes
authored andcommitted
SAMZA-1539: KafkaProducer potential hang on close() when task.drop.pr…
…oducer.errors==true Author: Jacob Maes <[email protected]> Reviewers: Boris Shkolnik <[email protected]> Closes apache#390 from jmakes/samza-1539
1 parent 29ecae8 commit ef1c962

File tree

2 files changed

+103
-71
lines changed

2 files changed

+103
-71
lines changed

samza-kafka/src/main/scala/org/apache/samza/system/kafka/KafkaSystemProducer.scala

Lines changed: 75 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import org.apache.kafka.clients.producer.Producer
2828
import org.apache.kafka.clients.producer.ProducerRecord
2929
import org.apache.kafka.clients.producer.RecordMetadata
3030
import org.apache.kafka.common.PartitionInfo
31-
import org.apache.kafka.common.errors.SerializationException
3231
import org.apache.samza.system.OutgoingMessageEnvelope
3332
import org.apache.samza.system.SystemProducer
3433
import org.apache.samza.system.SystemProducerException
@@ -46,32 +45,30 @@ class KafkaSystemProducer(systemName: String,
4645

4746
// Represents a fatal error that caused the producer to close.
4847
val fatalException: AtomicReference[SystemProducerException] = new AtomicReference[SystemProducerException]()
49-
@volatile var producer: Producer[Array[Byte], Array[Byte]] = null
50-
val producerLock: Object = new Object
48+
val producerRef: AtomicReference[Producer[Array[Byte], Array[Byte]]] = new AtomicReference[Producer[Array[Byte], Array[Byte]]]()
49+
val producerCreationLock: Object = new Object
50+
@volatile var stopped = false
5151

5252
def start(): Unit = {
53-
producer = getProducer()
53+
producerRef.set(getProducer())
5454
}
5555

5656
def stop() {
5757
info("Stopping producer for system: " + this.systemName)
5858

59-
// stop() should not happen often so no need to optimize locking
60-
producerLock.synchronized {
61-
try {
62-
if (producer != null) {
63-
producer.close // Also performs the equivalent of a flush()
64-
}
59+
stopped = true
60+
val currentProducer = producerRef.getAndSet(null)
61+
try {
62+
if (currentProducer != null) {
63+
currentProducer.close // Also performs the equivalent of a flush()
64+
}
6565

66-
val exception = fatalException.get()
67-
if (exception != null) {
68-
error("Observed an earlier send() error while closing producer", exception)
69-
}
70-
} catch {
71-
case e: Exception => error("Error while closing producer for system: " + systemName, e)
72-
} finally {
73-
producer = null
66+
val exception = fatalException.get()
67+
if (exception != null) {
68+
error("Observed an earlier send() error while closing producer", exception)
7469
}
70+
} catch {
71+
case e: Exception => error("Error while closing producer for system: " + systemName, e)
7572
}
7673
}
7774

@@ -82,7 +79,7 @@ class KafkaSystemProducer(systemName: String,
8279
trace("Enqueuing message: %s, %s." format (source, envelope))
8380

8481
val topicName = envelope.getSystemStream.getStream
85-
if (topicName == null || topicName == "") {
82+
if (topicName == null || topicName.isEmpty) {
8683
throw new IllegalArgumentException("Invalid system stream: " + envelope.getSystemStream)
8784
}
8885

@@ -92,10 +89,7 @@ class KafkaSystemProducer(systemName: String,
9289
throw new SystemProducerException("Producer was unable to recover from previous exception.", globalProducerException)
9390
}
9491

95-
val currentProducer = producer
96-
if (currentProducer == null) {
97-
throw new SystemProducerException("Kafka producer is null.")
98-
}
92+
val currentProducer = getOrCreateCurrentProducer
9993

10094
// Java-based Kafka producer API requires an "Integer" type partitionKey and does not allow custom overriding of Partitioners
10195
// Any kind of custom partitioning has to be done on the client-side
@@ -115,7 +109,7 @@ class KafkaSystemProducer(systemName: String,
115109
val producerException = new SystemProducerException("Failed to send message for Source: %s on System:%s Topic:%s Partition:%s"
116110
.format(source, systemName, topicName, partitionKey), exception)
117111

118-
handleSendException(currentProducer, producerException, true)
112+
handleFatalSendException(currentProducer, producerException)
119113
}
120114
}
121115
})
@@ -125,18 +119,25 @@ class KafkaSystemProducer(systemName: String,
125119
val producerException = new SystemProducerException("Failed to send message for Source: %s on System:%s Topic:%s Partition:%s"
126120
.format(source, systemName, topicName, partitionKey), originalException)
127121

128-
handleSendException(currentProducer, producerException, isFatalException(originalException))
122+
metrics.sendFailed.inc
123+
error("Got a synchronous error from Kafka producer.", producerException)
124+
// Synchronous exceptions are always recoverable so propagate it up and let the user decide
129125
throw producerException
130126
}
131127
}
132128

133-
134129
def flush(source: String) {
135130
updateTimer(metrics.flushNs) {
136131
metrics.flushes.inc
137132

138-
val currentProducer = producer
133+
val currentProducer = producerRef.get()
139134
if (currentProducer == null) {
135+
if (dropProducerExceptions) {
136+
// No producer to flush, but we're ignoring exceptions so just return.
137+
warn("Skipping flush because the Kafka producer is null.")
138+
metrics.flushFailed.inc
139+
return
140+
}
140141
throw new SystemProducerException("Kafka producer is null.")
141142
}
142143

@@ -162,7 +163,14 @@ class KafkaSystemProducer(systemName: String,
162163
}
163164

164165

165-
private def handleSendException(currentProducer: Producer[Array[Byte], Array[Byte]], producerException: SystemProducerException, isFatalException: Boolean) = {
166+
/**
167+
* Handles a fatal exception by closing the producer and either recreating it or storing the exception
168+
* to rethrow later, depending on the value of dropProducerExceptions.
169+
*
170+
* @param currentProducer the current producer for which the exception occurred. Must not be null.
171+
* @param producerException the exception to handle.
172+
*/
173+
private def handleFatalSendException(currentProducer: Producer[Array[Byte], Array[Byte]], producerException: SystemProducerException): Unit = {
166174
metrics.sendFailed.inc
167175
error(producerException)
168176
// The SystemProducer API is synchronous, so there's no way for us to guarantee that an exception will
@@ -172,49 +180,56 @@ class KafkaSystemProducer(systemName: String,
172180
if (dropProducerExceptions) {
173181
warn("Ignoring producer exception. All messages in the failed producer request will be dropped!")
174182

175-
if (isFatalException) {
176-
producerLock.synchronized {
177-
// Prevent each callback from recreating producer for the same failure.
178-
if (currentProducer == producer) {
179-
info("Creating a new producer for system %s." format systemName)
180-
try {
181-
currentProducer.close(0, TimeUnit.MILLISECONDS)
182-
} catch {
183-
case exception: Exception => error("Exception while closing producer.", exception)
184-
}
185-
producer = getProducer()
186-
}
187-
}
188-
}
189-
} else {
190-
// If there is an exception in the callback, it means that the Kafka producer has exhausted the max-retries
191-
// Close producer to ensure messages queued in-flight are not sent and hence, avoid re-ordering
192-
// This works because there is only 1 callback thread and no sends can complete until the callback returns.
193-
if (isFatalException) {
194-
fatalException.compareAndSet(null, producerException)
183+
// Prevent each callback from closing and nulling producer for the same failure.
184+
if (currentProducer == producerRef.get()) {
185+
info("Closing producer for system %s." format systemName)
195186
try {
187+
// send()s can get ProducerClosedException if the producer is stopped after they get the currentProducer
188+
// reference but before producer.send() returns. That's ONLY ok when dropProducerExceptions is true.
189+
// Also, when producer.close(0) is invoked on the Kafka IO thread, when it returns there will be no more
190+
// messages sent over the wire. This is key to ensuring no out-of-order messages as a result of recreating
191+
// the producer.
196192
currentProducer.close(0, TimeUnit.MILLISECONDS)
197193
} catch {
198194
case exception: Exception => error("Exception while closing producer.", exception)
199195
}
196+
producerRef.compareAndSet(currentProducer, null)
197+
}
198+
} else {
199+
// If there is an exception in the callback, it means that the Kafka producer has exhausted the max-retries
200+
// Close producer to ensure messages queued in-flight are not sent and hence, avoid re-ordering
201+
// This works because there is only 1 IO thread and no IO can be done until the callback returns.
202+
// Do not create a new producer here! It cannot be done without data loss for all concurrency modes.
203+
fatalException.compareAndSet(null, producerException)
204+
try {
205+
currentProducer.close(0, TimeUnit.MILLISECONDS)
206+
} catch {
207+
case exception: Exception => error("Exception while closing producer.", exception)
200208
}
201209
}
202210
}
203211

204212
/**
205-
* A fatal exception is one that corrupts the producer or otherwise makes it unusable.
206-
* We want to handle non-fatal exceptions differently because they can often be handled by the user
207-
* and that's preferable because it gives users that drop exceptions a way to do that with less
208-
* data loss (no collateral damage from batches of messages getting dropped)
209-
*
210-
* @param exception the exception to check
211-
* @return true if the exception is unrecoverable.
213+
* @return the current producer. Never returns null.
212214
*/
213-
private def isFatalException(exception: Exception): Boolean = {
214-
exception match {
215-
case _: SerializationException => false
216-
case _: ClassCastException => false
217-
case _ => true
215+
private def getOrCreateCurrentProducer = {
216+
var currentProducer = producerRef.get
217+
218+
if (currentProducer == null) {
219+
if (dropProducerExceptions && !stopped) {
220+
// Note: While this lock prevents others from creating a new producer, they could still set it to null.
221+
producerCreationLock.synchronized {
222+
currentProducer = producerRef.get
223+
if (currentProducer == null) {
224+
currentProducer = getProducer()
225+
producerRef.set(currentProducer)
226+
}
227+
}
228+
// Invariant: currentProducer must not be null at this point.
229+
} else {
230+
throw new SystemProducerException("Kafka producer is null.")
231+
}
218232
}
233+
currentProducer
219234
}
220235
}

samza-kafka/src/test/scala/org/apache/samza/system/kafka/TestKafkaSystemProducer.scala

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class TestKafkaSystemProducer {
4141
systemProducer.register("test")
4242
systemProducer.start
4343
systemProducer.send("test", someMessage)
44-
assertEquals(1, systemProducer.producer.asInstanceOf[MockProducer[Array[Byte], Array[Byte]]].history().size())
44+
assertEquals(1, systemProducer.producerRef.get().asInstanceOf[MockProducer[Array[Byte], Array[Byte]]].history().size())
4545
systemProducer.stop
4646
}
4747

@@ -207,7 +207,7 @@ class TestKafkaSystemProducer {
207207
val mockProducer = new MockKafkaProducer(1, "test", 1)
208208
val producer = new KafkaSystemProducer(systemName = "test",
209209
getProducer = () => {
210-
mockProducer.open() // A new producer is never closed
210+
mockProducer.open() // A new producer would not already be closed, so reset it.
211211
mockProducer
212212
},
213213
metrics = producerMetrics)
@@ -219,6 +219,7 @@ class TestKafkaSystemProducer {
219219
mockProducer.setErrorNext(true, true, new RecordTooLargeException())
220220
producer.send("test", msg3) // Callback exception
221221
assertTrue(mockProducer.isClosed)
222+
assertNotNull(producer.producerRef.get())
222223
assertEquals("Should NOT have created a new producer", 1, mockProducer.getOpenCount)
223224

224225
val senderException = intercept[SystemProducerException] {
@@ -269,7 +270,7 @@ class TestKafkaSystemProducer {
269270
val mockProducer = new MockKafkaProducer(1, "test", 1)
270271
val producer = new KafkaSystemProducer(systemName = "test",
271272
getProducer = () => {
272-
mockProducer.open() // A new producer is never closed
273+
mockProducer.open() // A new producer would not already be closed, so reset it.
273274
mockProducer
274275
},
275276
metrics = producerMetrics)
@@ -286,6 +287,7 @@ class TestKafkaSystemProducer {
286287
mockProducer.setErrorNext(true, true, new RecordTooLargeException())
287288
producer.send("test1", msg3) // Callback exception
288289
assertTrue(mockProducer.isClosed)
290+
assertNotNull(producer.producerRef.get())
289291
assertEquals("Should NOT have created a new producer", 1, mockProducer.getOpenCount)
290292

291293
// Subsequent sends
@@ -343,7 +345,7 @@ class TestKafkaSystemProducer {
343345
val mockProducer = new MockKafkaProducer(1, "test", 1)
344346
val producer = new KafkaSystemProducer(systemName = "test",
345347
getProducer = () => {
346-
mockProducer.open() // A new producer is never closed
348+
mockProducer.open() // A new producer would not already be closed, so reset it.
347349
mockProducer
348350
},
349351
metrics = producerMetrics)
@@ -359,6 +361,7 @@ class TestKafkaSystemProducer {
359361
}
360362
assertTrue(sendException.getCause.isInstanceOf[SerializationException])
361363
assertFalse(mockProducer.isClosed)
364+
assertNotNull(producer.producerRef.get())
362365
assertEquals("Should NOT have created a new producer", 1, mockProducer.getOpenCount)
363366

364367
producer.send("test1", msg3) // Should be able to resend msg3
@@ -406,7 +409,7 @@ class TestKafkaSystemProducer {
406409
val mockProducer = new MockKafkaProducer(1, "test", 1)
407410
val producer = new KafkaSystemProducer(systemName = "test",
408411
getProducer = () => {
409-
mockProducer.open() // A new producer is never closed
412+
mockProducer.open() // A new producer would not already be closed, so reset it.
410413
mockProducer
411414
},
412415
metrics = producerMetrics,
@@ -418,13 +421,18 @@ class TestKafkaSystemProducer {
418421
producer.send("test", msg2)
419422
mockProducer.setErrorNext(true, true, new RecordTooLargeException())
420423
producer.send("test", msg3) // Callback exception
421-
assertFalse(mockProducer.isClosed)
422-
assertEquals("Should have created a new producer", 2, mockProducer.getOpenCount)
424+
assertTrue(mockProducer.isClosed)
425+
assertNull(producer.producerRef.get())
426+
assertEquals("Should not have created a new producer", 1, mockProducer.getOpenCount)
423427

424428
producer.send("test", msg4) // Should succeed because the producer recovered.
429+
assertFalse(mockProducer.isClosed)
430+
assertNotNull(producer.producerRef.get())
431+
assertEquals("Should have created a new producer", 2, mockProducer.getOpenCount)
425432
producer.flush("test") // Should not throw
426433

427434
producer.send("test", msg5) // Should be able to send again after flush
435+
assertEquals("Should not have created a new producer", 2, mockProducer.getOpenCount)
428436
producer.flush("test")
429437

430438
assertEquals(4, mockProducer.getMsgsSent) // every message except the one with the error should get sent
@@ -456,7 +464,7 @@ class TestKafkaSystemProducer {
456464
val mockProducer = new MockKafkaProducer(1, "test", 1)
457465
val producer = new KafkaSystemProducer(systemName = "test",
458466
getProducer = () => {
459-
mockProducer.open() // A new producer is never closed
467+
mockProducer.open() // A new producer would not already be closed, so reset it.
460468
mockProducer
461469
},
462470
metrics = producerMetrics,
@@ -473,12 +481,17 @@ class TestKafkaSystemProducer {
473481
// Inject error for next send
474482
mockProducer.setErrorNext(true, true, new RecordTooLargeException())
475483
producer.send("test1", msg3) // Callback exception
476-
assertFalse(mockProducer.isClosed)
477-
assertEquals("Should have created a new producer", 2, mockProducer.getOpenCount)
484+
assertTrue(mockProducer.isClosed)
485+
assertNull(producer.producerRef.get())
486+
assertEquals("Should not have created a new producer", 1, mockProducer.getOpenCount)
478487

479488
// Subsequent sends
480489
producer.send("test1", msg4) // Should succeed because the producer recovered.
490+
assertFalse(mockProducer.isClosed)
491+
assertEquals("Should have created a new producer", 2, mockProducer.getOpenCount)
492+
assertNotNull(producer.producerRef.get())
481493
producer.send("test2", msg5) // Second source should also not have any error.
494+
assertEquals("Should not have created a new producer", 2, mockProducer.getOpenCount)
482495

483496
// Flushes
484497
producer.flush("test2") // Should not throw for test2
@@ -503,7 +516,7 @@ class TestKafkaSystemProducer {
503516
val mockProducer = new MockKafkaProducer(1, "test", 1)
504517
val producer = new KafkaSystemProducer(systemName = "test",
505518
getProducer = () => {
506-
mockProducer.open() // A new producer is never closed
519+
mockProducer.open() // A new producer would not already be closed, so reset it.
507520
mockProducer
508521
},
509522
metrics = producerMetrics,
@@ -520,9 +533,13 @@ class TestKafkaSystemProducer {
520533
}
521534
assertTrue(sendException.getCause.isInstanceOf[SerializationException])
522535
assertFalse(mockProducer.isClosed)
536+
assertNotNull(producer.producerRef.get()) // Synchronous error; producer should not be recreated
523537
assertEquals("Should NOT have created a new producer", 1, mockProducer.getOpenCount)
524538

525539
producer.send("test1", msg3) // Should be able to resend msg3
540+
assertFalse(mockProducer.isClosed)
541+
assertEquals("Should NOT have created a new producer", 1, mockProducer.getOpenCount)
542+
assertNotNull(producer.producerRef.get())
526543
producer.send("test2", msg4) // Second source should not be affected
527544

528545
producer.flush("test1") // Flush should be unaffected

0 commit comments

Comments
 (0)