Skip to content

Commit 827dbb1

Browse files
committed
HDFS-14370. Add exponential backoff to the edit log tailer to avoid spinning on empty edit tail requests. Contributed by Erik Krogen.
1 parent 0520f5c commit 827dbb1

File tree

5 files changed

+135
-11
lines changed

5 files changed

+135
-11
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
10041004
public static final int DFS_HA_LOGROLL_PERIOD_DEFAULT = 2 * 60; // 2m
10051005
public static final String DFS_HA_TAILEDITS_PERIOD_KEY = "dfs.ha.tail-edits.period";
10061006
public static final int DFS_HA_TAILEDITS_PERIOD_DEFAULT = 60; // 1m
1007+
public static final String DFS_HA_TAILEDITS_PERIOD_BACKOFF_MAX_KEY = "dfs.ha.tail-edits.period.backoff-max";
1008+
public static final int DFS_HA_TAILEDITS_PERIOD_BACKOFF_MAX_DEFAULT = 0; // disabled
10071009
public static final String DFS_HA_TAILEDITS_ALL_NAMESNODES_RETRY_KEY = "dfs.ha.tail-edits.namenode-retries";
10081010
public static final int DFS_HA_TAILEDITS_ALL_NAMESNODES_RETRY_DEFAULT = 3;
10091011
public static final String DFS_HA_TAILEDITS_INPROGRESS_KEY =

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,17 @@ public class EditLogTailer {
134134
private final ExecutorService rollEditsRpcExecutor;
135135

136136
/**
137-
* How often the Standby should check if there are new finalized segment(s)
138-
* available to be read from.
137+
* How often the tailer should check if there are new edit log entries
138+
* ready to be consumed. This is the initial delay before any backoff.
139139
*/
140140
private final long sleepTimeMs;
141+
/**
142+
* The maximum time the tailer should wait between checking for new edit log
143+
* entries. Exponential backoff will be applied when an edit log tail is
144+
* performed but no edits are available to be read. If this is less than or
145+
* equal to 0, backoff is disabled.
146+
*/
147+
private final long maxSleepTimeMs;
141148

142149
private final int nnCount;
143150
private NamenodeProtocol cachedActiveProxy = null;
@@ -206,6 +213,19 @@ public EditLogTailer(FSNamesystem namesystem, Configuration conf) {
206213
DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY,
207214
DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_DEFAULT,
208215
TimeUnit.SECONDS, TimeUnit.MILLISECONDS);
216+
long maxSleepTimeMsTemp = conf.getTimeDuration(
217+
DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_BACKOFF_MAX_KEY,
218+
DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_BACKOFF_MAX_DEFAULT,
219+
TimeUnit.SECONDS, TimeUnit.MILLISECONDS);
220+
if (maxSleepTimeMsTemp > 0 && maxSleepTimeMsTemp < sleepTimeMs) {
221+
LOG.warn("{} was configured to be {} ms, but this is less than {}."
222+
+ "Disabling backoff when tailing edit logs.",
223+
DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_BACKOFF_MAX_KEY,
224+
maxSleepTimeMsTemp, DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY);
225+
maxSleepTimeMs = 0;
226+
} else {
227+
maxSleepTimeMs = maxSleepTimeMsTemp;
228+
}
209229

210230
rollEditsTimeoutMs = conf.getTimeDuration(
211231
DFSConfigKeys.DFS_HA_TAILEDITS_ROLLEDITS_TIMEOUT_KEY,
@@ -291,7 +311,7 @@ public Void run() throws Exception {
291311
}
292312

293313
@VisibleForTesting
294-
public void doTailEdits() throws IOException, InterruptedException {
314+
public long doTailEdits() throws IOException, InterruptedException {
295315
// Write lock needs to be interruptible here because the
296316
// transitionToActive RPC takes the write lock before calling
297317
// tailer.stop() -- so if we're not interruptible, it will
@@ -316,7 +336,7 @@ public void doTailEdits() throws IOException, InterruptedException {
316336
// edits file hasn't been started yet.
317337
LOG.warn("Edits tailer failed to find any streams. Will try again " +
318338
"later.", ioe);
319-
return;
339+
return 0;
320340
} finally {
321341
NameNode.getNameNodeMetrics().addEditLogFetchTime(
322342
Time.monotonicNow() - startTime);
@@ -347,6 +367,7 @@ public void doTailEdits() throws IOException, InterruptedException {
347367
lastLoadTimeMs = monotonicNow();
348368
}
349369
lastLoadedTxnId = image.getLastAppliedTxId();
370+
return editsLoaded;
350371
} finally {
351372
namesystem.writeUnlock();
352373
}
@@ -407,6 +428,11 @@ void triggerActiveLogRoll() {
407428
}
408429
}
409430

431+
@VisibleForTesting
432+
void sleep(long sleepTimeMillis) throws InterruptedException {
433+
Thread.sleep(sleepTimeMillis);
434+
}
435+
410436
/**
411437
* The thread which does the actual work of tailing edits journals and
412438
* applying the transactions to the FSNS.
@@ -435,7 +461,9 @@ public Object run() {
435461
}
436462

437463
private void doWork() {
464+
long currentSleepTimeMs = sleepTimeMs;
438465
while (shouldRun) {
466+
long editsTailed = 0;
439467
try {
440468
// There's no point in triggering a log roll if the Standby hasn't
441469
// read any more transactions since the last time a roll was
@@ -461,7 +489,7 @@ private void doWork() {
461489
try {
462490
NameNode.getNameNodeMetrics().addEditLogTailInterval(
463491
startTime - lastLoadTimeMs);
464-
doTailEdits();
492+
editsTailed = doTailEdits();
465493
} finally {
466494
namesystem.cpUnlock();
467495
NameNode.getNameNodeMetrics().addEditLogTailTime(
@@ -481,7 +509,17 @@ private void doWork() {
481509
}
482510

483511
try {
484-
Thread.sleep(sleepTimeMs);
512+
if (editsTailed == 0 && maxSleepTimeMs > 0) {
513+
// If no edits were tailed, apply exponential backoff
514+
// before tailing again. Double the current sleep time on each
515+
// empty response, but don't exceed the max. If the sleep time
516+
// was configured as 0, start the backoff at 1 ms.
517+
currentSleepTimeMs = Math.min(maxSleepTimeMs,
518+
(currentSleepTimeMs == 0 ? 1 : currentSleepTimeMs) * 2);
519+
} else {
520+
currentSleepTimeMs = sleepTimeMs; // reset to initial sleep time
521+
}
522+
EditLogTailer.this.sleep(currentSleepTimeMs);
485523
} catch (InterruptedException e) {
486524
LOG.warn("Edit log tailer interrupted: {}", e.getMessage());
487525
}

hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,9 +1662,25 @@
16621662
<name>dfs.ha.tail-edits.period</name>
16631663
<value>60s</value>
16641664
<description>
1665-
How often, in seconds, the StandbyNode should check for new
1666-
finalized log segments in the shared edits log.
1667-
Support multiple time unit suffix(case insensitive), as described
1665+
How often, the StandbyNode and ObserverNode should check if there are new
1666+
edit log entries ready to be consumed. This is the minimum period between
1667+
checking; exponential backoff will be applied if no edits are found and
1668+
dfs.ha.tail-edits.period.backoff-max is configured. By default, no
1669+
backoff is applied.
1670+
Supports multiple time unit suffix (case insensitive), as described
1671+
in dfs.heartbeat.interval.
1672+
</description>
1673+
</property>
1674+
1675+
<property>
1676+
<name>dfs.ha.tail-edits.period.backoff-max</name>
1677+
<value>0</value>
1678+
<description>
1679+
The maximum time the tailer should wait between checking for new edit log
1680+
entries. Exponential backoff will be applied when an edit log tail is
1681+
performed but no edits are available to be read. Values less than or
1682+
equal to zero disable backoff entirely; this is the default behavior.
1683+
Supports multiple time unit suffix (case insensitive), as described
16681684
in dfs.heartbeat.interval.
16691685
</description>
16701686
</property>

hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/ObserverNameNode.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,33 @@ few configurations to your **hdfs-site.xml**:
140140
If too large, RPC time will increase as client requests will wait
141141
longer in the RPC queue before Observer tails edit logs and catches
142142
up the latest state of Active. The default value is 1min. It is
143-
**highly recommend** to configure this to a much lower value.
143+
**highly recommend** to configure this to a much lower value. It is also
144+
recommended to configure backoff to be enabled when using low values; please
145+
see below.
144146

145147
<property>
146148
<name>dfs.ha.tail-edits.period</name>
147149
<value>0ms</value>
148150
</property>
149151

152+
* **dfs.ha.tail-edits.period.backoff-max** - whether the Standby/Observer
153+
NameNodes should perform backoff when tailing edits.
154+
155+
This determines the behavior of a Standby/Observer when it attempts to
156+
tail edits from the JournalNodes and finds no edits available. This is a
157+
common situation when the edit tailing period is very low, but the cluster
158+
is not heavily loaded. Without this configuration, such a situation will
159+
cause high utilization on the Standby/Observer as it constantly attempts to
160+
read edits even though there are none available. With this configuration
161+
enabled, exponential backoff will be performed when an edit tail attempt
162+
returns 0 edits. This configuration specifies the maximum time to wait
163+
between edit tailing attempts.
164+
165+
<property>
166+
<name>dfs.ha.tail-edits.period</name>
167+
<value>10s</value>
168+
</property>
169+
150170
* **dfs.journalnode.edit-cache-size.bytes** - the in-memory cache size,
151171
in bytes, on the JournalNodes.
152172

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,22 @@
2020
import static org.junit.Assert.assertEquals;
2121
import static org.junit.Assert.assertTrue;
2222
import static org.junit.Assert.fail;
23+
import static org.mockito.Mockito.mock;
2324
import static org.mockito.Mockito.when;
2425

2526
import java.io.File;
2627
import java.io.IOException;
2728
import java.net.BindException;
2829
import java.net.URI;
2930
import java.util.ArrayList;
31+
import java.util.Arrays;
3032
import java.util.Collection;
33+
import java.util.List;
34+
import java.util.Queue;
3135
import java.util.Random;
3236
import java.util.concurrent.Callable;
37+
import java.util.concurrent.ConcurrentLinkedQueue;
38+
import java.util.concurrent.TimeUnit;
3339
import java.util.concurrent.TimeoutException;
3440
import java.util.concurrent.atomic.AtomicInteger;
3541

@@ -42,8 +48,10 @@
4248
import org.apache.hadoop.hdfs.HdfsConfiguration;
4349
import org.apache.hadoop.hdfs.MiniDFSCluster;
4450
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
51+
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
4552
import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
4653
import org.apache.hadoop.hdfs.server.namenode.FSImage;
54+
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
4755
import org.apache.hadoop.hdfs.server.namenode.NNStorage;
4856
import org.apache.hadoop.hdfs.server.namenode.NameNode;
4957
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
@@ -150,7 +158,47 @@ public void testTailer() throws IOException, InterruptedException,
150158
cluster.shutdown();
151159
}
152160
}
153-
161+
162+
@Test
163+
public void testTailerBackoff() throws Exception {
164+
Configuration conf = new Configuration();
165+
NameNode.initMetrics(conf, HdfsServerConstants.NamenodeRole.NAMENODE);
166+
conf.setTimeDuration(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY,
167+
1, TimeUnit.MILLISECONDS);
168+
conf.setTimeDuration(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_BACKOFF_MAX_KEY,
169+
10, TimeUnit.MILLISECONDS);
170+
FSNamesystem mockNamesystem = mock(FSNamesystem.class);
171+
FSImage mockImage = mock(FSImage.class);
172+
NNStorage mockStorage = mock(NNStorage.class);
173+
when(mockNamesystem.getFSImage()).thenReturn(mockImage);
174+
when(mockImage.getStorage()).thenReturn(mockStorage);
175+
final Queue<Long> sleepDurations = new ConcurrentLinkedQueue<>();
176+
final int zeroEditCount = 5;
177+
final AtomicInteger tailEditsCallCount = new AtomicInteger(0);
178+
EditLogTailer tailer = new EditLogTailer(mockNamesystem, conf) {
179+
@Override
180+
void sleep(long sleepTimeMs) {
181+
if (sleepDurations.size() <= zeroEditCount) {
182+
sleepDurations.add(sleepTimeMs);
183+
}
184+
}
185+
186+
@Override
187+
public long doTailEdits() {
188+
return tailEditsCallCount.getAndIncrement() < zeroEditCount ? 0 : 1;
189+
}
190+
};
191+
tailer.start();
192+
try {
193+
GenericTestUtils.waitFor(
194+
() -> sleepDurations.size() > zeroEditCount, 50, 10000);
195+
} finally {
196+
tailer.stop();
197+
}
198+
List<Long> expectedDurations = Arrays.asList(2L, 4L, 8L, 10L, 10L, 1L);
199+
assertEquals(expectedDurations, new ArrayList<>(sleepDurations));
200+
}
201+
154202
@Test
155203
public void testNN0TriggersLogRolls() throws Exception {
156204
testStandbyTriggersLogRolls(0);

0 commit comments

Comments
 (0)