Skip to content

Commit e25a5c2

Browse files
steveloughran authored and Gabor Bota committed
HADOOP-16499. S3A retry policy to be exponential (#1246). Contributed by Steve Loughran.
1 parent 43a91f8 commit e25a5c2

File tree

12 files changed

+124
-46
lines changed

12 files changed

+124
-46
lines changed

hadoop-common-project/hadoop-common/src/main/resources/core-default.xml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1660,7 +1660,7 @@
16601660

16611661
<property>
16621662
<name>fs.s3a.retry.limit</name>
1663-
<value>${fs.s3a.attempts.maximum}</value>
1663+
<value>7</value>
16641664
<description>
16651665
Number of times to retry any repeatable S3 client request on failure,
16661666
excluding throttling requests.
@@ -1671,7 +1671,7 @@
16711671
<name>fs.s3a.retry.interval</name>
16721672
<value>500ms</value>
16731673
<description>
1674-
Interval between attempts to retry operations for any reason other
1674+
Initial retry interval when retrying operations for any reason other
16751675
than S3 throttle errors.
16761676
</description>
16771677
</property>

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -635,7 +635,7 @@ private Constants() {
635635
/**
636636
* Default retry limit: {@value}.
637637
*/
638-
public static final int RETRY_LIMIT_DEFAULT = DEFAULT_MAX_ERROR_RETRIES;
638+
public static final int RETRY_LIMIT_DEFAULT = 7;
639639

640640
/**
641641
* Interval between retry attempts.: {@value}.

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ public S3ARetryPolicy(Configuration conf) {
109109
Preconditions.checkArgument(conf != null, "Null configuration");
110110

111111
// base policy from configuration
112-
fixedRetries = retryUpToMaximumCountWithFixedSleep(
112+
fixedRetries = exponentialBackoffRetry(
113113
conf.getInt(RETRY_LIMIT, RETRY_LIMIT_DEFAULT),
114114
conf.getTimeDuration(RETRY_INTERVAL,
115115
RETRY_INTERVAL_DEFAULT,

hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1018,7 +1018,7 @@ is unrecoverable; it's the generic "No" response. Very rarely it
10181018
does recover, which is why it is in this category, rather than that
10191019
of unrecoverable failures.
10201020

1021-
These failures will be retried with a fixed sleep interval set in
1021+
These failures will be retried with an exponential sleep interval set in
10221022
`fs.s3a.retry.interval`, up to the limit set in `fs.s3a.retry.limit`.
10231023

10241024

@@ -1033,7 +1033,7 @@ after the request was processed by S3.
10331033
* "No response from Server" (443, 444) HTTP responses.
10341034
* Any other AWS client, service or S3 exception.
10351035

1036-
These failures will be retried with a fixed sleep interval set in
1036+
These failures will be retried with an exponential sleep interval set in
10371037
`fs.s3a.retry.interval`, up to the limit set in `fs.s3a.retry.limit`.
10381038

10391039
*Important*: DELETE is considered idempotent, hence: `FileSystem.delete()`

hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1233,17 +1233,20 @@ The number of retries and interval between each retry can be configured:
12331233

12341234
```xml
12351235
<property>
1236-
<name>fs.s3a.attempts.maximum</name>
1237-
<value>20</value>
1238-
<description>How many times we should retry commands on transient errors,
1239-
excluding throttling errors.</description>
1236+
<name>fs.s3a.retry.limit</name>
1237+
<value>7</value>
1238+
<description>
1239+
Number of times to retry any repeatable S3 client request on failure,
1240+
excluding throttling requests.
1241+
</description>
12401242
</property>
12411243

12421244
<property>
12431245
<name>fs.s3a.retry.interval</name>
12441246
<value>500ms</value>
12451247
<description>
1246-
Interval between retry attempts.
1248+
Initial retry interval when retrying operations for any reason other
1249+
than S3 throttle errors.
12471250
</description>
12481251
</property>
12491252
```

hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AConfiguration.java

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -123,8 +123,7 @@ public void testEndpoint() throws Exception {
123123

124124
@Test
125125
public void testProxyConnection() throws Exception {
126-
conf = new Configuration();
127-
conf.setInt(Constants.MAX_ERROR_RETRIES, 2);
126+
useFailFastConfiguration();
128127
conf.set(Constants.PROXY_HOST, "127.0.0.1");
129128
conf.setInt(Constants.PROXY_PORT, 1);
130129
String proxy =
@@ -133,6 +132,16 @@ public void testProxyConnection() throws Exception {
133132
conf, "when using proxy " + proxy);
134133
}
135134

135+
/**
136+
* Create a configuration designed to fail fast on network problems.
137+
*/
138+
protected void useFailFastConfiguration() {
139+
conf = new Configuration();
140+
conf.setInt(Constants.MAX_ERROR_RETRIES, 2);
141+
conf.setInt(Constants.RETRY_LIMIT, 2);
142+
conf.set(RETRY_INTERVAL, "100ms");
143+
}
144+
136145
/**
137146
* Expect a filesystem to not be created from a configuration
138147
* @return the exception intercepted
@@ -153,9 +162,8 @@ private <E extends Throwable> E expectFSCreateFailure(
153162

154163
@Test
155164
public void testProxyPortWithoutHost() throws Exception {
156-
conf = new Configuration();
165+
useFailFastConfiguration();
157166
conf.unset(Constants.PROXY_HOST);
158-
conf.setInt(Constants.MAX_ERROR_RETRIES, 2);
159167
conf.setInt(Constants.PROXY_PORT, 1);
160168
IllegalArgumentException e = expectFSCreateFailure(
161169
IllegalArgumentException.class,
@@ -169,9 +177,8 @@ public void testProxyPortWithoutHost() throws Exception {
169177

170178
@Test
171179
public void testAutomaticProxyPortSelection() throws Exception {
172-
conf = new Configuration();
180+
useFailFastConfiguration();
173181
conf.unset(Constants.PROXY_PORT);
174-
conf.setInt(Constants.MAX_ERROR_RETRIES, 2);
175182
conf.set(Constants.PROXY_HOST, "127.0.0.1");
176183
conf.set(Constants.SECURE_CONNECTIONS, "true");
177184
expectFSCreateFailure(AWSClientIOException.class,
@@ -183,8 +190,7 @@ public void testAutomaticProxyPortSelection() throws Exception {
183190

184191
@Test
185192
public void testUsernameInconsistentWithPassword() throws Exception {
186-
conf = new Configuration();
187-
conf.setInt(Constants.MAX_ERROR_RETRIES, 2);
193+
useFailFastConfiguration();
188194
conf.set(Constants.PROXY_HOST, "127.0.0.1");
189195
conf.setInt(Constants.PROXY_PORT, 1);
190196
conf.set(Constants.PROXY_USERNAME, "user");
@@ -204,8 +210,7 @@ private void assertIsProxyUsernameError(final IllegalArgumentException e) {
204210

205211
@Test
206212
public void testUsernameInconsistentWithPassword2() throws Exception {
207-
conf = new Configuration();
208-
conf.setInt(Constants.MAX_ERROR_RETRIES, 2);
213+
useFailFastConfiguration();
209214
conf.set(Constants.PROXY_HOST, "127.0.0.1");
210215
conf.setInt(Constants.PROXY_PORT, 1);
211216
conf.set(Constants.PROXY_PASSWORD, "password");

hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ADelayedFNF.java

Lines changed: 26 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,8 @@
1818

1919
package org.apache.hadoop.fs.s3a;
2020

21+
import org.apache.hadoop.conf.Configuration;
2122
import org.apache.hadoop.fs.FSDataInputStream;
22-
import org.apache.hadoop.fs.FileSystem;
2323
import org.apache.hadoop.fs.Path;
2424
import org.apache.hadoop.fs.contract.ContractTestUtils;
2525
import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy;
@@ -30,14 +30,35 @@
3030
import org.junit.Test;
3131

3232
import java.io.FileNotFoundException;
33-
import java.util.concurrent.Callable;
33+
34+
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_MODE;
35+
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE;
36+
import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
37+
import static org.apache.hadoop.fs.s3a.Constants.RETRY_INTERVAL;
38+
import static org.apache.hadoop.fs.s3a.Constants.RETRY_LIMIT;
39+
import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
3440

3541
/**
3642
* Tests behavior of a FileNotFound error that happens after open(), i.e. on
3743
* the first read.
3844
*/
3945
public class ITestS3ADelayedFNF extends AbstractS3ATestBase {
4046

47+
@Override
48+
protected Configuration createConfiguration() {
49+
Configuration conf = super.createConfiguration();
50+
// reduce retry limit so FileNotFoundException cases timeout faster,
51+
// speeding up the tests
52+
removeBaseAndBucketOverrides(conf,
53+
CHANGE_DETECT_SOURCE,
54+
CHANGE_DETECT_MODE,
55+
RETRY_LIMIT,
56+
RETRY_INTERVAL,
57+
METADATASTORE_AUTHORITATIVE);
58+
conf.setInt(RETRY_LIMIT, 2);
59+
conf.set(RETRY_INTERVAL, "1ms");
60+
return conf;
61+
}
4162

4263
/**
4364
* See debugging documentation
@@ -46,9 +67,9 @@ public class ITestS3ADelayedFNF extends AbstractS3ATestBase {
4667
*/
4768
@Test
4869
public void testNotFoundFirstRead() throws Exception {
49-
FileSystem fs = getFileSystem();
70+
S3AFileSystem fs = getFileSystem();
5071
ChangeDetectionPolicy changeDetectionPolicy =
51-
((S3AFileSystem) fs).getChangeDetectionPolicy();
72+
fs.getChangeDetectionPolicy();
5273
Assume.assumeFalse("FNF not expected when using a bucket with"
5374
+ " object versioning",
5475
changeDetectionPolicy.getSource() == Source.VersionId);
@@ -61,12 +82,7 @@ public void testNotFoundFirstRead() throws Exception {
6182

6283
// This should fail since we deleted after the open.
6384
LambdaTestUtils.intercept(FileNotFoundException.class,
64-
new Callable<Integer>() {
65-
@Override
66-
public Integer call() throws Exception {
67-
return in.read();
68-
}
69-
});
85+
() -> in.read());
7086
}
7187

7288
}

hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java

Lines changed: 31 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,7 @@
2121
import org.apache.hadoop.conf.Configuration;
2222
import org.apache.hadoop.fs.FileStatus;
2323
import org.apache.hadoop.fs.Path;
24-
import org.apache.hadoop.fs.contract.AbstractFSContract;
2524
import org.apache.hadoop.fs.contract.ContractTestUtils;
26-
import org.apache.hadoop.fs.contract.s3a.S3AContract;
2725
import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy;
2826
import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source;
2927
import org.apache.hadoop.fs.s3a.s3guard.MetadataStore;
@@ -40,6 +38,7 @@
4038
import static org.apache.hadoop.fs.contract.ContractTestUtils.writeTextFile;
4139
import static org.apache.hadoop.fs.s3a.Constants.*;
4240
import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*;
41+
import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
4342
import static org.apache.hadoop.test.LambdaTestUtils.eventually;
4443
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
4544

@@ -53,16 +52,40 @@
5352
*/
5453
public class ITestS3AInconsistency extends AbstractS3ATestBase {
5554

56-
private static final int OPEN_READ_ITERATIONS = 20;
55+
private static final int OPEN_READ_ITERATIONS = 10;
56+
57+
public static final int INCONSISTENCY_MSEC = 800;
58+
59+
private static final int INITIAL_RETRY = 128;
60+
61+
private static final int RETRIES = 4;
62+
63+
/** By using a power of 2 for the initial time, the total is a shift left. */
64+
private static final int TOTAL_RETRY_DELAY = INITIAL_RETRY << RETRIES;
5765

5866
@Override
59-
protected AbstractFSContract createContract(Configuration conf) {
67+
protected Configuration createConfiguration() {
68+
Configuration conf = super.createConfiguration();
69+
// reduce retry limit so FileNotFoundException cases timeout faster,
70+
// speeding up the tests
71+
removeBaseAndBucketOverrides(conf,
72+
CHANGE_DETECT_SOURCE,
73+
CHANGE_DETECT_MODE,
74+
RETRY_LIMIT,
75+
RETRY_INTERVAL,
76+
METADATASTORE_AUTHORITATIVE,
77+
S3_CLIENT_FACTORY_IMPL);
6078
conf.setClass(S3_CLIENT_FACTORY_IMPL, InconsistentS3ClientFactory.class,
6179
S3ClientFactory.class);
6280
conf.set(FAIL_INJECT_INCONSISTENCY_KEY, DEFAULT_DELAY_KEY_SUBSTRING);
81+
// the reads are always inconsistent
6382
conf.setFloat(FAIL_INJECT_INCONSISTENCY_PROBABILITY, 1.0f);
64-
conf.setLong(FAIL_INJECT_INCONSISTENCY_MSEC, DEFAULT_DELAY_KEY_MSEC);
65-
return new S3AContract(conf);
83+
// but the inconsistent time is less than exponential growth of the
84+
// retry interval (128 -> 256 -> 512 -> 1024
85+
conf.setLong(FAIL_INJECT_INCONSISTENCY_MSEC, INCONSISTENCY_MSEC);
86+
conf.setInt(RETRY_LIMIT, RETRIES);
87+
conf.set(RETRY_INTERVAL, String.format("%dms", INITIAL_RETRY));
88+
return conf;
6689
}
6790

6891
@Test
@@ -111,7 +134,7 @@ public void testGetFileStatus() throws Exception {
111134
public void testOpenDeleteRead() throws Exception {
112135
S3AFileSystem fs = getFileSystem();
113136
ChangeDetectionPolicy changeDetectionPolicy =
114-
((S3AFileSystem) fs).getChangeDetectionPolicy();
137+
fs.getChangeDetectionPolicy();
115138
Assume.assumeFalse("FNF not expected when using a bucket with"
116139
+ " object versioning",
117140
changeDetectionPolicy.getSource() == Source.VersionId);
@@ -124,7 +147,7 @@ public void testOpenDeleteRead() throws Exception {
124147
fs.setMetadataStore(new NullMetadataStore());
125148
fs.delete(p, false);
126149
fs.setMetadataStore(metadataStore);
127-
eventually(1000, 200, () -> {
150+
eventually(TOTAL_RETRY_DELAY * 2, INITIAL_RETRY * 2, () -> {
128151
intercept(FileNotFoundException.class, () -> s.read());
129152
});
130153
}

hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARemoteFileChanged.java

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -61,8 +61,7 @@
6161
import static org.apache.hadoop.fs.contract.ContractTestUtils.readUTF8;
6262
import static org.apache.hadoop.fs.contract.ContractTestUtils.writeDataset;
6363
import static org.apache.hadoop.fs.s3a.Constants.*;
64-
import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName;
65-
import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBucketOverrides;
64+
import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
6665
import static org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.CHANGE_DETECTED;
6766
import static org.apache.hadoop.fs.s3a.select.SelectConstants.S3_SELECT_CAPABILITY;
6867
import static org.apache.hadoop.fs.s3a.select.SelectConstants.SELECT_SQL;
@@ -123,8 +122,8 @@ public class ITestS3ARemoteFileChanged extends AbstractS3ATestBase {
123122

124123
private static final byte[] TEST_DATA_BYTES = TEST_DATA.getBytes(
125124
Charsets.UTF_8);
126-
private static final int TEST_MAX_RETRIES = 5;
127-
private static final String TEST_RETRY_INTERVAL = "10ms";
125+
private static final int TEST_MAX_RETRIES = 4;
126+
private static final String TEST_RETRY_INTERVAL = "1ms";
128127
private static final String QUOTED_TEST_DATA =
129128
"\"" + TEST_DATA + "\"";
130129

@@ -276,8 +275,7 @@ public void teardown() throws Exception {
276275
@Override
277276
protected Configuration createConfiguration() {
278277
Configuration conf = super.createConfiguration();
279-
String bucketName = getTestBucketName(conf);
280-
removeBucketOverrides(bucketName, conf,
278+
removeBaseAndBucketOverrides(conf,
281279
CHANGE_DETECT_SOURCE,
282280
CHANGE_DETECT_MODE,
283281
RETRY_LIMIT,

hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardOutOfBandOperations.java

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,9 +53,13 @@
5353
import static org.apache.hadoop.fs.contract.ContractTestUtils.readBytesToString;
5454
import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
5555
import static org.apache.hadoop.fs.contract.ContractTestUtils.writeTextFile;
56+
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_MODE;
57+
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE;
5658
import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_METADATASTORE_METADATA_TTL;
5759
import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
5860
import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_METADATA_TTL;
61+
import static org.apache.hadoop.fs.s3a.Constants.RETRY_INTERVAL;
62+
import static org.apache.hadoop.fs.s3a.Constants.RETRY_LIMIT;
5963
import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
6064
import static org.apache.hadoop.fs.s3a.S3ATestUtils.checkListingContainsPath;
6165
import static org.apache.hadoop.fs.s3a.S3ATestUtils.checkListingDoesNotContainPath;
@@ -115,7 +119,7 @@ public class ITestS3GuardOutOfBandOperations extends AbstractS3ATestBase {
115119

116120
public static final int STABILIZATION_TIME = 20_000;
117121

118-
public static final int PROBE_INTERVAL_MILLIS = 500;
122+
public static final int PROBE_INTERVAL_MILLIS = 2500;
119123

120124
private S3AFileSystem guardedFs;
121125
private S3AFileSystem rawFS;
@@ -153,6 +157,19 @@ protected String getMethodName() {
153157
(authoritative ? "-auth" : "-nonauth");
154158
}
155159

160+
@Override
161+
protected Configuration createConfiguration() {
162+
Configuration conf = super.createConfiguration();
163+
// reduce retry limit so FileNotFoundException cases timeout faster,
164+
// speeding up the tests
165+
removeBaseAndBucketOverrides(conf,
166+
RETRY_LIMIT,
167+
RETRY_INTERVAL);
168+
conf.setInt(RETRY_LIMIT, 3);
169+
conf.set(RETRY_INTERVAL, "10ms");
170+
return conf;
171+
}
172+
156173
@Before
157174
public void setup() throws Exception {
158175
super.setup();

0 commit comments

Comments (0)