
Commit 2c238ae

HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. Contributed by Lei (Eddy) Xu.
1 parent 972f1f1 commit 2c238ae

6 files changed: +88 additions, -70 deletions

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Lines changed: 3 additions & 0 deletions
@@ -774,6 +774,9 @@ Release 2.7.0 - UNRELEASED

     HDFS-7962. Remove duplicated logs in BlockManager. (yliu)

+    HDFS-7917. Use file to replace data dirs in test to simulate a disk failure.
+    (Lei (Eddy) Xu via cnauroth)
+
   OPTIMIZATIONS

     HDFS-7454. Reduce memory footprint for AclEntries in NameNode.

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java

Lines changed: 60 additions & 1 deletion
@@ -40,7 +40,9 @@
  * Utility class for accessing package-private DataNode information during tests.
  *
  */
-public class DataNodeTestUtils {
+public class DataNodeTestUtils {
+  private static final String DIR_FAILURE_SUFFIX = ".origin";
+
   public static DatanodeRegistration
   getDNRegistrationForBP(DataNode dn, String bpid) throws IOException {
     return dn.getDNRegistrationForBP(bpid);
@@ -159,4 +161,61 @@ public static ReplicaInfo fetchReplicaInfo(final DataNode dn,
       final String bpid, final long blkId) {
     return FsDatasetTestUtil.fetchReplicaInfo(dn.getFSDataset(), bpid, blkId);
   }
+
+  /**
+   * It injects disk failures to data dirs by replacing these data dirs with
+   * regular files.
+   *
+   * @param dirs data directories.
+   * @throws IOException on I/O error.
+   */
+  public static void injectDataDirFailure(File... dirs) throws IOException {
+    for (File dir : dirs) {
+      File renamedTo = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
+      if (renamedTo.exists()) {
+        throw new IOException(String.format(
+            "Can not inject failure to dir: %s because %s exists.",
+            dir, renamedTo));
+      }
+      if (!dir.renameTo(renamedTo)) {
+        throw new IOException(String.format("Failed to rename %s to %s.",
+            dir, renamedTo));
+      }
+      if (!dir.createNewFile()) {
+        throw new IOException(String.format(
+            "Failed to create file %s to inject disk failure.", dir));
+      }
+    }
+  }
+
+  /**
+   * Restore the injected data dir failures.
+   *
+   * @see {@link #injectDataDirFailures}.
+   * @param dirs data directories.
+   * @throws IOException
+   */
+  public static void restoreDataDirFromFailure(File... dirs)
+      throws IOException {
+    for (File dir : dirs) {
+      File renamedDir = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
+      if (renamedDir.exists()) {
+        if (dir.exists()) {
+          if (!dir.isFile()) {
+            throw new IOException(
+                "Injected failure data dir is supposed to be file: " + dir);
+          }
+          if (!dir.delete()) {
+            throw new IOException(
+                "Failed to delete injected failure data dir: " + dir);
+          }
+        }
+        if (!renamedDir.renameTo(dir)) {
+          throw new IOException(String.format(
+              "Failed to recover injected failure data dir %s to %s.",
+              renamedDir, dir));
+        }
+      }
+    }
+  }
 }
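
A minimal usage sketch for the new helpers (not part of this commit): it assumes a JUnit test with a MiniDFSCluster already started in setUp(); the class name, test method, and the "data1" volume choice are illustrative assumptions.

import java.io.File;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.junit.Test;

public class DiskFailureInjectionExample {
  private MiniDFSCluster cluster;  // assumed to be started in setUp()

  @Test
  public void testWithSimulatedDiskFailure() throws Exception {
    // First data directory of the first DataNode; MiniDFSCluster names its
    // volumes data1, data2, ... under getDataDirectory().
    File vol = new File(cluster.getDataDirectory(), "data1");

    // Replace the directory with a regular file so the DataNode's disk
    // checker reports the volume as failed.
    DataNodeTestUtils.injectDataDirFailure(vol);
    try {
      // ... exercise the DataNode while the volume is failed ...
    } finally {
      // Move the original directory back so later tests can reuse it.
      DataNodeTestUtils.restoreDataDirFromFailure(vol);
    }
  }
}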

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java

Lines changed: 10 additions & 19 deletions
@@ -26,7 +26,6 @@
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockMissingException;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -682,26 +681,18 @@ public void testDirectlyReloadAfterCheckDiskError()
         failedVolume != null);
     long used = failedVolume.getDfsUsed();

-    try {
-      assertTrue("Couldn't chmod local vol: " + dirToFail,
-          FileUtil.setExecutable(dirToFail, false));
-      // Call and wait DataNode to detect disk failure.
-      long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
-      dn.checkDiskErrorAsync();
-      while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
-        Thread.sleep(100);
-      }
-
-      createFile(new Path("/test1"), 32, (short)2);
-      assertEquals(used, failedVolume.getDfsUsed());
-    } finally {
-      // Need to restore the mode on dirToFail. Otherwise, if an Exception
-      // is thrown above, the following tests can not delete this data directory
-      // and thus fail to start MiniDFSCluster.
-      assertTrue("Couldn't restore executable for: " + dirToFail,
-          FileUtil.setExecutable(dirToFail, true));
+    DataNodeTestUtils.injectDataDirFailure(dirToFail);
+    // Call and wait DataNode to detect disk failure.
+    long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
+    dn.checkDiskErrorAsync();
+    while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
+      Thread.sleep(100);
     }

+    createFile(new Path("/test1"), 32, (short)2);
+    assertEquals(used, failedVolume.getDfsUsed());
+
+    DataNodeTestUtils.restoreDataDirFromFailure(dirToFail);
     dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, oldDataDir);

     createFile(new Path("/test2"), 32, (short)2);

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java

Lines changed: 3 additions & 8 deletions
@@ -121,10 +121,6 @@ public void tearDown() throws Exception {
     if(cluster != null) {
       cluster.shutdown();
     }
-    for (int i = 0; i < 3; i++) {
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
-    }
   }

   /*
@@ -159,7 +155,7 @@ public void testVolumeFailure() throws Exception {
         !deteteBlocks(failedDir)
         ) {
       throw new IOException("Could not delete hdfs directory '" + failedDir + "'");
-    }
+    }
     data_fail.setReadOnly();
     failedDir.setReadOnly();
     System.out.println("Deleteing " + failedDir.getPath() + "; exist=" + failedDir.exists());
@@ -217,7 +213,7 @@ public void testFailedVolumeBeingRemovedFromDataNode()
     DFSTestUtil.waitReplication(fs, file1, (short) 2);

     File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
-    assertTrue(FileUtil.setExecutable(dn0Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
     DataNode dn0 = cluster.getDataNodes().get(0);
     long lastDiskErrorCheck = dn0.getLastDiskErrorCheck();
     dn0.checkDiskErrorAsync();
@@ -291,8 +287,7 @@ public void testUnderReplicationAfterVolFailure() throws Exception {
     // Fail the first volume on both datanodes
     File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);

     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java

Lines changed: 10 additions & 36 deletions
@@ -34,7 +34,6 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.ReconfigurationException;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
@@ -87,19 +86,6 @@ public void setUp() throws Exception {

   @After
   public void tearDown() throws Exception {
-    // Restore executable permission on all directories where a failure may have
-    // been simulated by denying execute access. This is based on the maximum
-    // number of datanodes and the maximum number of storages per data node used
-    // throughout the tests in this suite.
-    assumeTrue(!Path.WINDOWS);
-    int maxDataNodes = 3;
-    int maxStoragesPerDataNode = 4;
-    for (int i = 0; i < maxDataNodes; i++) {
-      for (int j = 1; j <= maxStoragesPerDataNode; j++) {
-        String subDir = "data" + ((i * maxStoragesPerDataNode) + j);
-        FileUtil.setExecutable(new File(dataDir, subDir), true);
-      }
-    }
     IOUtils.cleanup(LOG, fs);
     if (cluster != null) {
       cluster.shutdown();
@@ -141,8 +127,7 @@ public void testSuccessiveVolumeFailures() throws Exception {
      * fail. The client does not retry failed nodes even though
      * perhaps they could succeed because just a single volume failed.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);

     /*
      * Create file1 and wait for 3 replicas (ie all DNs can still
@@ -179,7 +164,7 @@ public void testSuccessiveVolumeFailures() throws Exception {
      * Now fail a volume on the third datanode. We should be able to get
      * three replicas since we've already identified the other failures.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn3Vol1);
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file2, (short)3);
@@ -208,7 +193,7 @@ public void testSuccessiveVolumeFailures() throws Exception {
      * and that it's no longer up. Only wait for two replicas since
      * we'll never get a third.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false));
+    DataNodeTestUtils.injectDataDirFailure(dn3Vol2);
     Path file3 = new Path("/test3");
     DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file3, (short)2);
@@ -233,10 +218,8 @@ public void testSuccessiveVolumeFailures() throws Exception {
      * restart, so file creation should be able to succeed after
      * restoring the data directories and restarting the datanodes.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(
+        dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
     cluster.restartDataNodes();
     cluster.waitActive();
     Path file4 = new Path("/test4");
@@ -275,8 +258,7 @@ public void testVolFailureStatsPreservedOnNNRestart() throws Exception {
     // third healthy so one node in the pipeline will not fail).
     File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);

     Path file1 = new Path("/test1");
     DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@@ -323,14 +305,7 @@ public void testMultipleVolFailuresOnNode() throws Exception {

     // Make the first two volume directories on the first two datanodes
     // non-accessible.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol2,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol2,
-        false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);

     // Create file1 and wait for 3 replicas (ie all DNs can still store a block).
     // Then assert that all DNs are up, despite the volume failures.
@@ -380,8 +355,8 @@ public void testDataNodeReconfigureWithVolumeFailures() throws Exception {
     File dn1Vol2 = new File(dataDir, "data"+(2*0+2));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
     File dn2Vol2 = new File(dataDir, "data"+(2*1+2));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1);
+    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);

     Path file1 = new Path("/test1");
     DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@@ -449,8 +424,7 @@ public void testDataNodeReconfigureWithVolumeFailures() throws Exception {

     // Replace failed volume with healthy volume and run reconfigure DataNode.
     // The failed volume information should be cleared.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1);
     reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
     reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java

Lines changed: 2 additions & 6 deletions
@@ -76,10 +76,6 @@ public void setUp() throws Exception {

   @After
   public void tearDown() throws Exception {
-    for (int i = 0; i < 3; i++) {
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
-    }
     cluster.shutdown();
   }

@@ -152,7 +148,7 @@ public void testConfigureMinValidVolumes() throws Exception {

     // Fail a volume on the 2nd DN
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);

     // Should only get two replicas (the first DN and the 3rd)
     Path file1 = new Path("/test1");
@@ -165,7 +161,7 @@ public void testConfigureMinValidVolumes() throws Exception {

     // If we restore the volume we should still only be able to get
     // two replicas since the DN is still considered dead.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(dn2Vol1);
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file2, (short)2);
