Skip to content

Commit 3758270

Browse files
author
Hanisha Koneru
authored
HDDS-1403. KeyOutputStream writes fails after max retries while writing to a closed container (#753)
1 parent 556eafd commit 3758270

File tree

7 files changed

+69
-10
lines changed

7 files changed

+69
-10
lines changed

hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727
import org.apache.ratis.proto.RaftProtos.ReplicationLevel;
2828
import org.apache.ratis.util.TimeDuration;
2929

30-
/**
30+
import java.util.concurrent.TimeUnit;
31+
32+
/**
3133
* This class contains constants for configuration keys used in Ozone.
3234
*/
3335
@InterfaceAudience.Public
@@ -140,8 +142,11 @@ public final class OzoneConfigKeys {
140142

141143
public static final String OZONE_CLIENT_MAX_RETRIES =
142144
"ozone.client.max.retries";
143-
public static final int OZONE_CLIENT_MAX_RETRIES_DEFAULT = 5;
144-
145+
public static final int OZONE_CLIENT_MAX_RETRIES_DEFAULT = 100;
146+
public static final String OZONE_CLIENT_RETRY_INTERVAL =
147+
"ozone.client.retry.interval";
148+
public static final TimeDuration OZONE_CLIENT_RETRY_INTERVAL_DEFAULT =
149+
TimeDuration.valueOf(0, TimeUnit.MILLISECONDS);
145150

146151
// This defines the overall connection limit for the connection pool used in
147152
// RestClient.

hadoop-hdds/common/src/main/resources/ozone-default.xml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,12 +429,21 @@
429429
</property>
430430
<property>
431431
<name>ozone.client.max.retries</name>
432-
<value>5</value>
432+
<value>100</value>
433433
<tag>OZONE, CLIENT</tag>
434434
<description>Maximum number of retries by Ozone Client on encountering
435435
an exception while writing a key.
436436
</description>
437437
</property>
438+
<property>
439+
<name>ozone.client.retry.interval</name>
440+
<value>0ms</value>
441+
<tag>OZONE, CLIENT</tag>
442+
<description>Indicates the time duration a client will wait before
443+
retrying a write key request on encountering an exception. By default
444+
there is no wait.
445+
</description>
446+
</property>
438447
<property>
439448
<name>ozone.client.protocol</name>
440449
<value>org.apache.hadoop.ozone.client.rpc.RpcClient</value>

hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/OzoneClientUtils.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,11 @@ public static KeyInfoDetails asKeyInfoDetails(OzoneKeyDetails key) {
127127
return keyInfo;
128128
}
129129

130-
public static RetryPolicy createRetryPolicy(int maxRetryCount) {
130+
public static RetryPolicy createRetryPolicy(int maxRetryCount,
131+
long retryInterval) {
131132
// retry up to maxRetryCount times, sleeping retryInterval ms between attempts (0 = no sleep)
132133
RetryPolicy retryPolicy = RetryPolicies
133-
.retryUpToMaximumCountWithFixedSleep(maxRetryCount, 0,
134+
.retryUpToMaximumCountWithFixedSleep(maxRetryCount, retryInterval,
134135
TimeUnit.MILLISECONDS);
135136
return retryPolicy;
136137
}

hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/KeyOutputStream.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,8 @@ public KeyOutputStream(OpenKeySession handler,
164164
String requestId, ReplicationFactor factor, ReplicationType type,
165165
long bufferFlushSize, long bufferMaxSize, long size, long watchTimeout,
166166
ChecksumType checksumType, int bytesPerChecksum,
167-
String uploadID, int partNumber, boolean isMultipart, int maxRetryCount) {
167+
String uploadID, int partNumber, boolean isMultipart,
168+
int maxRetryCount, long retryInterval) {
168169
this.streamEntries = new ArrayList<>();
169170
this.currentStreamIndex = 0;
170171
this.omClient = omClient;
@@ -199,7 +200,8 @@ public KeyOutputStream(OpenKeySession handler,
199200
this.bufferPool =
200201
new BufferPool(chunkSize, (int)streamBufferMaxSize / chunkSize);
201202
this.excludeList = new ExcludeList();
202-
this.retryPolicy = OzoneClientUtils.createRetryPolicy(maxRetryCount);
203+
this.retryPolicy = OzoneClientUtils.createRetryPolicy(maxRetryCount,
204+
retryInterval);
203205
this.retryCount = 0;
204206
}
205207

@@ -726,6 +728,7 @@ public static class Builder {
726728
private int multipartNumber;
727729
private boolean isMultipartKey;
728730
private int maxRetryCount;
731+
private long retryInterval;
729732

730733

731734
public Builder setMultipartUploadID(String uploadID) {
@@ -814,12 +817,17 @@ public Builder setMaxRetryCount(int maxCount) {
814817
return this;
815818
}
816819

820+
public Builder setRetryInterval(long retryIntervalInMS) {
821+
this.retryInterval = retryIntervalInMS;
822+
return this;
823+
}
824+
817825
public KeyOutputStream build() throws IOException {
818826
return new KeyOutputStream(openHandler, xceiverManager,
819827
omClient, chunkSize, requestID, factor, type, streamBufferFlushSize,
820828
streamBufferMaxSize, blockSize, watchTimeout, checksumType,
821829
bytesPerChecksum, multipartUploadID, multipartNumber, isMultipartKey,
822-
maxRetryCount);
830+
maxRetryCount, retryInterval);
823831
}
824832
}
825833

hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rpc/RpcClient.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
import org.apache.hadoop.hdds.scm.protocolPB
8686
.StorageContainerLocationProtocolPB;
8787
import org.apache.hadoop.ozone.security.OzoneTokenIdentifier;
88+
import org.apache.hadoop.ozone.web.utils.OzoneUtils;
8889
import org.apache.hadoop.security.UserGroupInformation;
8990
import org.apache.hadoop.security.token.Token;
9091
import org.apache.hadoop.io.Text;
@@ -128,6 +129,7 @@ public class RpcClient implements ClientProtocol, KeyProviderTokenIssuer {
128129
private final long watchTimeout;
129130
private final ClientId clientId = ClientId.randomId();
130131
private final int maxRetryCount;
132+
private final long retryInterval;
131133
private Text dtService;
132134

133135
/**
@@ -214,6 +216,9 @@ public RpcClient(Configuration conf) throws IOException {
214216
maxRetryCount =
215217
conf.getInt(OzoneConfigKeys.OZONE_CLIENT_MAX_RETRIES, OzoneConfigKeys.
216218
OZONE_CLIENT_MAX_RETRIES_DEFAULT);
219+
retryInterval = OzoneUtils.getTimeDurationInMS(conf,
220+
OzoneConfigKeys.OZONE_CLIENT_RETRY_INTERVAL,
221+
OzoneConfigKeys.OZONE_CLIENT_RETRY_INTERVAL_DEFAULT);
217222
dtService =
218223
getOMProxyProvider().getProxy().getDelegationTokenService();
219224
boolean isUnsafeByteOperationsEnabled = conf.getBoolean(
@@ -861,6 +866,7 @@ public OzoneOutputStream createMultipartKey(String volumeName,
861866
.setMultipartUploadID(uploadID)
862867
.setIsMultipartKey(true)
863868
.setMaxRetryCount(maxRetryCount)
869+
.setRetryInterval(retryInterval)
864870
.build();
865871
keyOutputStream.addPreallocateBlocks(
866872
openKey.getKeyInfo().getLatestVersionLocations(),
@@ -1022,7 +1028,9 @@ private OzoneOutputStream createOutputStream(OpenKeySession openKey,
10221028
.setBlockSize(blockSize)
10231029
.setChecksumType(checksumType)
10241030
.setBytesPerChecksum(bytesPerChecksum)
1025-
.setMaxRetryCount(maxRetryCount).build();
1031+
.setMaxRetryCount(maxRetryCount)
1032+
.setRetryInterval(retryInterval)
1033+
.build();
10261034
keyOutputStream
10271035
.addPreallocateBlocks(openKey.getKeyInfo().getLatestVersionLocations(),
10281036
openKey.getOpenVersion());

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/web/utils/OzoneUtils.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,15 @@
2626
import java.util.Locale;
2727
import java.util.TimeZone;
2828
import java.util.UUID;
29+
import java.util.concurrent.TimeUnit;
2930

3031
import org.apache.hadoop.classification.InterfaceAudience;
3132
import org.apache.hadoop.conf.Configuration;
3233
import org.apache.hadoop.hdds.HddsUtils;
3334
import org.apache.hadoop.ozone.OzoneConsts;
3435

3536
import com.google.common.base.Preconditions;
37+
import org.apache.ratis.util.TimeDuration;
3638

3739
/**
3840
* Set of Utility functions used in ozone.
@@ -214,4 +216,24 @@ public static void verifyResourceName(String resName)
214216
}
215217
}
216218

219+
/**
220+
* Return the TimeDuration configured for the given key. If not configured,
221+
* return the default value. The result uses the default value's time unit.
222+
*/
223+
public static TimeDuration getTimeDuration(Configuration conf, String key,
224+
TimeDuration defaultValue) {
225+
TimeUnit defaultTimeUnit = defaultValue.getUnit();
226+
long timeDurationInDefaultUnit = conf.getTimeDuration(key,
227+
defaultValue.getDuration(), defaultTimeUnit);
228+
return TimeDuration.valueOf(timeDurationInDefaultUnit, defaultTimeUnit);
229+
}
230+
231+
/**
232+
* Return the time configured for the given key in milliseconds.
233+
*/
234+
public static long getTimeDurationInMS(Configuration conf, String key,
235+
TimeDuration defaultValue) {
236+
return getTimeDuration(conf, key, defaultValue)
237+
.toLong(TimeUnit.MILLISECONDS);
238+
}
217239
}

hadoop-ozone/objectstore-service/src/main/java/org/apache/hadoop/ozone/web/storage/DistributedStorageHandler.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
import org.apache.hadoop.ozone.web.handlers.UserArgs;
5858
import org.apache.hadoop.ozone.web.interfaces.StorageHandler;
5959
import org.apache.hadoop.ozone.web.response.*;
60+
import org.apache.hadoop.ozone.web.utils.OzoneUtils;
6061
import org.slf4j.Logger;
6162
import org.slf4j.LoggerFactory;
6263

@@ -90,6 +91,7 @@ public final class DistributedStorageHandler implements StorageHandler {
9091
private final int bytesPerChecksum;
9192
private final boolean verifyChecksum;
9293
private final int maxRetryCount;
94+
private final long retryInterval;
9395

9496
/**
9597
* Creates a new DistributedStorageHandler.
@@ -159,6 +161,9 @@ public DistributedStorageHandler(OzoneConfiguration conf,
159161
this.maxRetryCount =
160162
conf.getInt(OzoneConfigKeys.OZONE_CLIENT_MAX_RETRIES, OzoneConfigKeys.
161163
OZONE_CLIENT_MAX_RETRIES_DEFAULT);
164+
this.retryInterval = OzoneUtils.getTimeDurationInMS(conf,
165+
OzoneConfigKeys.OZONE_CLIENT_RETRY_INTERVAL,
166+
OzoneConfigKeys.OZONE_CLIENT_RETRY_INTERVAL_DEFAULT);
162167
boolean isUnsafeByteOperationsEnabled = conf.getBoolean(
163168
OzoneConfigKeys.OZONE_UNSAFEBYTEOPERATIONS_ENABLED,
164169
OzoneConfigKeys.OZONE_UNSAFEBYTEOPERATIONS_ENABLED_DEFAULT);
@@ -464,6 +469,7 @@ public OutputStream newKeyWriter(KeyArgs args) throws IOException,
464469
.setChecksumType(checksumType)
465470
.setBytesPerChecksum(bytesPerChecksum)
466471
.setMaxRetryCount(maxRetryCount)
472+
.setRetryInterval(retryInterval)
467473
.build();
468474
keyOutputStream.addPreallocateBlocks(
469475
openKey.getKeyInfo().getLatestVersionLocations(),

0 commit comments

Comments
 (0)