Skip to content

Commit b039f75

Browse files
committed
HDDS-1682. TestEventWatcher.testMetrics is flaky
Closes #962.
1 parent 61ec03c commit b039f75

File tree

2 files changed

+23
-12
lines changed

2 files changed

+23
-12
lines changed

hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/server/events/EventWatcher.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,14 +143,15 @@ private synchronized void handleStartEvent(TIMEOUT_PAYLOAD payload,
143143
protected synchronized void handleCompletion(COMPLETION_PAYLOAD
144144
completionPayload, EventPublisher publisher) throws
145145
LeaseNotFoundException {
146-
metrics.incrementCompletedEvents();
147146
long id = completionPayload.getId();
148147
leaseManager.release(id);
149148
TIMEOUT_PAYLOAD payload = trackedEventsByID.remove(id);
150-
trackedEvents.remove(payload);
151-
long originalTime = startTrackingTimes.remove(id);
152-
metrics.updateFinishingTime(System.currentTimeMillis() - originalTime);
153-
onFinished(publisher, payload);
149+
if (trackedEvents.remove(payload)) {
150+
metrics.incrementCompletedEvents();
151+
long originalTime = startTrackingTimes.remove(id);
152+
metrics.updateFinishingTime(System.currentTimeMillis() - originalTime);
153+
onFinished(publisher, payload);
154+
}
154155
}
155156

156157
private synchronized void handleTimeout(EventPublisher publisher,

hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/server/events/TestEventWatcher.java

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -179,22 +179,32 @@ public void testMetrics() throws InterruptedException {
179179

180180
queue.fireEvent(REPLICATION_COMPLETED, event1Completed);
181181

182-
Thread.sleep(2200L);
182+
//lease manager timeout = 2000L
183+
Thread.sleep(3 * 2000L);
184+
185+
queue.processAll(2000L);
183186

184187
//until now: 3 in-progress activities are tracked with three
185188
// UnderreplicatedEvents. The first one is completed, the remaining two
186-
// are timed out (as the timeout -- defined in the leasmanager -- is 2000ms.
189+
// are timed out (as the timeout -- defined in the lease manager -- is
190+
// 2000ms).
187191

188192
EventWatcherMetrics metrics = replicationWatcher.getMetrics();
189193

190194
//3 events are received
191195
Assert.assertEquals(3, metrics.getTrackedEvents().value());
192196

193-
//one is finished. doesn't need to be resent
194-
Assert.assertEquals(1, metrics.getCompletedEvents().value());
195-
196-
//Other two are timed out and resent
197-
Assert.assertEquals(2, metrics.getTimedOutEvents().value());
197+
//completed + timed out = all messages
198+
Assert.assertEquals(
199+
"number of timed out and completed messages should be the same as the"
200+
+ " all messages",
201+
metrics.getTrackedEvents().value(),
202+
metrics.getCompletedEvents().value() + metrics.getTimedOutEvents()
203+
.value());
204+
205+
//_at least_ two are timed out.
206+
Assert.assertTrue("At least two events should be timed out.",
207+
metrics.getTimedOutEvents().value() >= 2);
198208

199209
DefaultMetricsSystem.shutdown();
200210
}

0 commit comments

Comments
 (0)