Skip to content

Commit ccceedb

Browse files
committed
HDDS-1780. TestFailureHandlingByClient tests are flaky. Contributed by Shashikant Banerjee. (#1073)
1 parent 23e9beb commit ccceedb

File tree

4 files changed

+180
-49
lines changed

4 files changed

+180
-49
lines changed

hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientGrpc.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ private XceiverClientReply sendCommandWithRetry(
285285
}
286286
break;
287287
} catch (ExecutionException | InterruptedException | IOException e) {
288-
LOG.debug("Failed to execute command " + request + " on datanode " + dn
288+
LOG.error("Failed to execute command " + request + " on datanode " + dn
289289
.getUuidString(), e);
290290
if (!(e instanceof IOException)) {
291291
if (Status.fromThrowable(e.getCause()).getCode()
@@ -306,8 +306,8 @@ private XceiverClientReply sendCommandWithRetry(
306306
return reply;
307307
} else {
308308
Preconditions.checkNotNull(ioException);
309-
LOG.error("Failed to execute command " + request + " on the pipeline "
310-
+ pipeline.getId());
309+
LOG.error("Failed to execute command {} on the pipeline {}.", request,
310+
pipeline);
311311
throw ioException;
312312
}
313313
}

hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockOutputStream.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
import java.io.IOException;
4343
import java.io.OutputStream;
4444
import java.nio.ByteBuffer;
45-
import java.util.Collections;
4645
import java.util.List;
4746
import java.util.ArrayList;
4847
import java.util.Map;
@@ -160,7 +159,7 @@ public BlockOutputStream(BlockID blockID,
160159
bufferList = null;
161160
totalDataFlushedLength = 0;
162161
writtenDataLength = 0;
163-
failedServers = Collections.emptyList();
162+
failedServers = new ArrayList<>(0);
164163
ioException = new AtomicReference<>(null);
165164
}
166165

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestFailureHandlingByClient.java

Lines changed: 8 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
2323
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
2424
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
25+
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
2526
import org.apache.hadoop.hdds.scm.container.ContainerID;
2627
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
2728
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
@@ -81,11 +82,16 @@ private void init() throws Exception {
8182
conf.setTimeDuration(OzoneConfigKeys.OZONE_CLIENT_WATCH_REQUEST_TIMEOUT, 5,
8283
TimeUnit.SECONDS);
8384
conf.setTimeDuration(HDDS_SCM_WATCHER_TIMEOUT, 1000, TimeUnit.MILLISECONDS);
84-
conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, TimeUnit.SECONDS);
85-
conf.setInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY, 5);
85+
conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 100, TimeUnit.SECONDS);
86+
conf.setInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY, 10);
8687
conf.setTimeDuration(
8788
OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY,
8889
1, TimeUnit.SECONDS);
90+
conf.setTimeDuration(
91+
OzoneConfigKeys.DFS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY,
92+
1, TimeUnit.SECONDS);
93+
conf.setBoolean(
94+
ScmConfigKeys.DFS_NETWORK_TOPOLOGY_AWARE_READ_ENABLED, false);
8995

9096
conf.setQuietMode(false);
9197
cluster = MiniOzoneCluster.newBuilder(conf)
@@ -156,48 +162,6 @@ public void testBlockWritesWithDnFailures() throws Exception {
156162
shutdown();
157163
}
158164

159-
@Test
160-
public void testMultiBlockWritesWithDnFailures() throws Exception {
161-
startCluster();
162-
String keyName = "ratis3";
163-
OzoneOutputStream key = createKey(keyName, ReplicationType.RATIS, 0);
164-
String data =
165-
ContainerTestHelper
166-
.getFixedLengthString(keyString, blockSize + chunkSize);
167-
key.write(data.getBytes());
168-
169-
// get the name of a valid container
170-
Assert.assertTrue(key.getOutputStream() instanceof KeyOutputStream);
171-
KeyOutputStream groupOutputStream =
172-
(KeyOutputStream) key.getOutputStream();
173-
List<OmKeyLocationInfo> locationInfoList =
174-
groupOutputStream.getLocationInfoList();
175-
Assert.assertTrue(locationInfoList.size() == 2);
176-
long containerId = locationInfoList.get(1).getContainerID();
177-
ContainerInfo container = cluster.getStorageContainerManager()
178-
.getContainerManager()
179-
.getContainer(ContainerID.valueof(containerId));
180-
Pipeline pipeline =
181-
cluster.getStorageContainerManager().getPipelineManager()
182-
.getPipeline(container.getPipelineID());
183-
List<DatanodeDetails> datanodes = pipeline.getNodes();
184-
cluster.shutdownHddsDatanode(datanodes.get(0));
185-
cluster.shutdownHddsDatanode(datanodes.get(1));
186-
187-
// The write will fail but exception will be handled and length will be
188-
// updated correctly in OzoneManager once the steam is closed
189-
key.write(data.getBytes());
190-
key.close();
191-
OmKeyArgs keyArgs = new OmKeyArgs.Builder().setVolumeName(volumeName)
192-
.setBucketName(bucketName).setType(HddsProtos.ReplicationType.RATIS)
193-
.setFactor(HddsProtos.ReplicationFactor.THREE).setKeyName(keyName)
194-
.setRefreshPipeline(true)
195-
.build();
196-
OmKeyInfo keyInfo = cluster.getOzoneManager().lookupKey(keyArgs);
197-
Assert.assertEquals(2 * data.getBytes().length, keyInfo.getDataSize());
198-
validateData(keyName, data.concat(data).getBytes());
199-
shutdown();
200-
}
201165

202166
@Test
203167
public void testMultiBlockWritesWithIntermittentDnFailures()
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with this
4+
* work for additional information regarding copyright ownership. The ASF
5+
* licenses this file to you under the Apache License, Version 2.0 (the
6+
* "License"); you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
* <p>
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
* <p>
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
* License for the specific language governing permissions and limitations under
15+
* the License.
16+
*/
17+
18+
package org.apache.hadoop.ozone.client.rpc;
19+
20+
import org.apache.hadoop.hdds.client.ReplicationType;
21+
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
22+
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
23+
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
24+
import org.apache.hadoop.hdds.scm.container.ContainerID;
25+
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
26+
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
27+
import org.apache.hadoop.ozone.MiniOzoneCluster;
28+
import org.apache.hadoop.ozone.OzoneConfigKeys;
29+
import org.apache.hadoop.ozone.OzoneConsts;
30+
import org.apache.hadoop.ozone.client.ObjectStore;
31+
import org.apache.hadoop.ozone.client.OzoneClient;
32+
import org.apache.hadoop.ozone.client.OzoneClientFactory;
33+
import org.apache.hadoop.ozone.client.io.KeyOutputStream;
34+
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
35+
import org.apache.hadoop.ozone.container.ContainerTestHelper;
36+
import org.apache.hadoop.ozone.om.helpers.OmKeyArgs;
37+
import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
38+
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
39+
import org.junit.Assert;
40+
import org.junit.Test;
41+
42+
import java.io.IOException;
43+
import java.util.List;
44+
import java.util.UUID;
45+
import java.util.concurrent.TimeUnit;
46+
47+
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.HDDS_SCM_WATCHER_TIMEOUT;
48+
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
49+
50+
/**
51+
* Tests MultiBlock Writes with Dn failures by Ozone Client.
52+
*/
53+
public class TestMultiBlockWritesWithDnFailures {
54+
55+
private MiniOzoneCluster cluster;
56+
private OzoneConfiguration conf;
57+
private OzoneClient client;
58+
private ObjectStore objectStore;
59+
private int chunkSize;
60+
private int blockSize;
61+
private String volumeName;
62+
private String bucketName;
63+
private String keyString;
64+
private int maxRetries;
65+
66+
/**
67+
* Create a MiniDFSCluster for testing.
68+
* <p>
69+
* Ozone is made active by setting OZONE_ENABLED = true
70+
*
71+
* @throws IOException
72+
*/
73+
private void init() throws Exception {
74+
conf = new OzoneConfiguration();
75+
maxRetries = 100;
76+
chunkSize = (int) OzoneConsts.MB;
77+
blockSize = 4 * chunkSize;
78+
conf.setTimeDuration(OzoneConfigKeys.OZONE_CLIENT_WATCH_REQUEST_TIMEOUT, 5,
79+
TimeUnit.SECONDS);
80+
conf.setTimeDuration(HDDS_SCM_WATCHER_TIMEOUT, 1000, TimeUnit.MILLISECONDS);
81+
conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 100, TimeUnit.SECONDS);
82+
conf.setInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY, 5);
83+
conf.setTimeDuration(
84+
OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY,
85+
1, TimeUnit.SECONDS);
86+
87+
conf.setQuietMode(false);
88+
cluster = MiniOzoneCluster.newBuilder(conf)
89+
.setNumDatanodes(6).build();
90+
cluster.waitForClusterToBeReady();
91+
//the easiest way to create an open container is creating a key
92+
client = OzoneClientFactory.getClient(conf);
93+
objectStore = client.getObjectStore();
94+
keyString = UUID.randomUUID().toString();
95+
volumeName = "datanodefailurehandlingtest";
96+
bucketName = volumeName;
97+
objectStore.createVolume(volumeName);
98+
objectStore.getVolume(volumeName).createBucket(bucketName);
99+
}
100+
101+
private void startCluster() throws Exception {
102+
init();
103+
}
104+
105+
/**
106+
* Shutdown MiniDFSCluster.
107+
*/
108+
private void shutdown() {
109+
if (cluster != null) {
110+
cluster.shutdown();
111+
}
112+
}
113+
114+
@Test
115+
public void testMultiBlockWritesWithDnFailures() throws Exception {
116+
startCluster();
117+
String keyName = "ratis3";
118+
OzoneOutputStream key = createKey(keyName, ReplicationType.RATIS, 0);
119+
String data =
120+
ContainerTestHelper
121+
.getFixedLengthString(keyString, blockSize + chunkSize);
122+
key.write(data.getBytes());
123+
124+
// get the name of a valid container
125+
Assert.assertTrue(key.getOutputStream() instanceof KeyOutputStream);
126+
KeyOutputStream groupOutputStream =
127+
(KeyOutputStream) key.getOutputStream();
128+
List<OmKeyLocationInfo> locationInfoList =
129+
groupOutputStream.getLocationInfoList();
130+
Assert.assertTrue(locationInfoList.size() == 2);
131+
long containerId = locationInfoList.get(1).getContainerID();
132+
ContainerInfo container = cluster.getStorageContainerManager()
133+
.getContainerManager()
134+
.getContainer(ContainerID.valueof(containerId));
135+
Pipeline pipeline =
136+
cluster.getStorageContainerManager().getPipelineManager()
137+
.getPipeline(container.getPipelineID());
138+
List<DatanodeDetails> datanodes = pipeline.getNodes();
139+
cluster.shutdownHddsDatanode(datanodes.get(0));
140+
cluster.shutdownHddsDatanode(datanodes.get(1));
141+
142+
// The write will fail but exception will be handled and length will be
143+
// updated correctly in OzoneManager once the steam is closed
144+
key.write(data.getBytes());
145+
key.close();
146+
OmKeyArgs keyArgs = new OmKeyArgs.Builder().setVolumeName(volumeName)
147+
.setBucketName(bucketName).setType(HddsProtos.ReplicationType.RATIS)
148+
.setFactor(HddsProtos.ReplicationFactor.THREE).setKeyName(keyName)
149+
.setRefreshPipeline(true)
150+
.build();
151+
OmKeyInfo keyInfo = cluster.getOzoneManager().lookupKey(keyArgs);
152+
Assert.assertEquals(2 * data.getBytes().length, keyInfo.getDataSize());
153+
validateData(keyName, data.concat(data).getBytes());
154+
shutdown();
155+
}
156+
157+
private OzoneOutputStream createKey(String keyName, ReplicationType type,
158+
long size) throws Exception {
159+
return ContainerTestHelper
160+
.createKey(keyName, type, size, objectStore, volumeName, bucketName);
161+
}
162+
163+
private void validateData(String keyName, byte[] data) throws Exception {
164+
ContainerTestHelper
165+
.validateData(keyName, data, objectStore, volumeName, bucketName);
166+
}
167+
168+
}

0 commit comments

Comments
 (0)