@@ -55,6 +55,7 @@ public class TestFileChecksum {
.getLogger(TestFileChecksum.class);
private final ErasureCodingPolicy ecPolicy =
StripedFileTestUtil.getDefaultECPolicy();
+ private final static long STALE_INTERVAL = 2000;
private int dataBlocks = ecPolicy.getNumDataUnits();
private int parityBlocks = ecPolicy.getNumParityUnits();

@@ -88,9 +89,20 @@ public void setup() throws IOException {
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_CONSIDERLOAD_KEY,
false);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 0);
+ conf.setLong(DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
+ STALE_INTERVAL);
conf.setBoolean(DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+ (int) (STALE_INTERVAL / 2));
+ conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY,
+ (int) (STALE_INTERVAL / 4));
+ conf.setInt(
+ DFSConfigKeys.DFS_NAMENODE_RECONSTRUCTION_PENDING_TIMEOUT_SEC_KEY, 4);
customizeConf(conf);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDNs).build();
+ cluster.waitClusterUp();
Path ecPath = new Path(ecDir);
cluster.getFileSystem().mkdir(ecPath, FsPermission.getDirDefault());
cluster.getFileSystem().getClient().setErasureCodingPolicy(ecDir,
@@ -107,6 +119,22 @@ public void setup() throws IOException {

@After
public void tearDown() {
+ // delete the directory
+ Path ecPath = new Path(ecDir);
+ try {
+ fs.delete(ecPath, true);
+ } catch (Exception ex) {
+ LOG.error("Could not delete ecDir", ex);
+ }
+
+ if (client != null) {
+ try {
+ client.close();
+ } catch (IOException e) {
+ LOG.error("Error closing the fsClient", e);
+ }
+ }
+
if (cluster != null) {
cluster.shutdown();
cluster = null;
@@ -475,7 +503,6 @@ public void testStripedFileChecksumWithMissedDataBlocksRangeQuery16()
throws Exception {
int fileLength = 100;
String stripedFile3 = ecDir + "/stripedFileChecksum3";
- prepareTestFiles(fileLength, new String[] {stripedFile3});
testStripedFileChecksumWithMissedDataBlocksRangeQuery(stripedFile3,
fileLength - 1);
}
@@ -487,9 +514,7 @@ public void testStripedFileChecksumWithMissedDataBlocksRangeQuery16()
@Test(timeout = 90000)
public void testStripedFileChecksumWithMissedDataBlocksRangeQuery17()
throws Exception {
- int fileLength = 100;
String stripedFile3 = ecDir + "/stripedFileChecksum3";
- prepareTestFiles(fileLength, new String[] {stripedFile3});
testStripedFileChecksumWithMissedDataBlocksRangeQuery(stripedFile3, 1);
}

@@ -502,7 +527,6 @@ public void testStripedFileChecksumWithMissedDataBlocksRangeQuery18()
throws Exception {
int fileLength = 100;
String stripedFile3 = ecDir + "/stripedFileChecksum3";
- prepareTestFiles(fileLength, new String[] {stripedFile3});
testStripedFileChecksumWithMissedDataBlocksRangeQuery(stripedFile3, 10);
}

@@ -515,7 +539,6 @@ public void testStripedFileChecksumWithMissedDataBlocksRangeQuery19()
throws Exception {
int fileLength = 100;
String stripedFile3 = ecDir + "/stripedFileChecksum3";
- prepareTestFiles(fileLength, new String[] {stripedFile3});
testStripedFileChecksumWithMissedDataBlocksRangeQuery(stripedFile3,
fileLength * 2);
}
@@ -527,9 +550,7 @@ public void testStripedFileChecksumWithMissedDataBlocksRangeQuery19()
@Test(timeout = 90000)
public void testStripedFileChecksumWithMissedDataBlocksRangeQuery20()
throws Exception {
- int fileLength = bytesPerCRC;
String stripedFile3 = ecDir + "/stripedFileChecksum3";
- prepareTestFiles(fileLength, new String[] {stripedFile3});
testStripedFileChecksumWithMissedDataBlocksRangeQuery(stripedFile3,
bytesPerCRC - 1);
}
@@ -575,6 +596,8 @@ private FileChecksum getFileChecksum(String filePath, int range,
dnIdxToDie = getDataNodeToKill(filePath);
DataNode dnToDie = cluster.getDataNodes().get(dnIdxToDie);
shutdownDataNode(dnToDie);
+ // wait enough time for the locations to be updated.
+ Thread.sleep(STALE_INTERVAL);

Member:
Should we waitFor instead?

Contributor Author:
Yes, I agree with you @goiri.
I experimented with waiting on the number of live replicas: that did not work. It stayed at 8 and did not go back to 9.
Do you have a suggestion for what condition we should be waiting for?

Member:
I am not very close to this part of the code, but there must be ways to force the statistics to update.
Not sure who can help with this part of the code.
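
For reference, a condition-based wait of the kind being suggested could look roughly like the sketch below. It is illustrative only, not part of this PR: it assumes org.apache.hadoop.test.GenericTestUtils.waitFor plus org.apache.hadoop.fs.BlockLocation, and that fs, filePath, and dnToDie are the same variables used in getFileChecksum(). It polls the locations reported by the NameNode until the stopped DataNode's transfer address disappears; whether that is the right condition for this test is exactly what this thread is still debating.

// Sketch only: wait for the dead DataNode to drop out of the reported
// block locations instead of sleeping a fixed STALE_INTERVAL.
// The enclosing method would need to handle or declare the checked
// TimeoutException/InterruptedException thrown by waitFor.
final String deadDnAddr = dnToDie.getDatanodeId().getXferAddr();
final Path p = new Path(filePath);
GenericTestUtils.waitFor(() -> {
  try {
    for (BlockLocation loc : fs.getFileBlockLocations(p, 0, Long.MAX_VALUE)) {
      for (String name : loc.getNames()) {
        if (name.equals(deadDnAddr)) {
          return false; // the stopped DataNode is still reported as a location
        }
      }
    }
    return true;
  } catch (IOException e) {
    return false;
  }
}, 100, 10000);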

Contributor Author:
I see. The problem is that I cannot reproduce it on my local machine. However, it seems to fail in a consistent way on Yetus.
If it is not a real bug, I wonder whether the VolumeScanner could be a factor in randomly slowing down the DNs. I see many log messages from the volume scanner when I run locally.

Member:
I see there is truncation there too, though.
We may want to make the test a little less verbose.

Contributor Author:
Thanks @goiri.
Those are the logs I was looking at. All logs in TestFileChecksum and TestFileChecksumCompositeCrc truncate the last 9 seconds prior to the failure.
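
If the goal is just to cut the noise, one option (an illustrative sketch, not something this change does) would be to raise the log level of the chattiest components, such as the volume scanner mentioned above, at the start of the test. The log4j 1.x API is assumed to be on the test classpath, as it historically has been for the HDFS tests.

// Illustrative only: quiet the VolumeScanner output in the test logs.
org.apache.log4j.Logger.getLogger(
    org.apache.hadoop.hdfs.server.datanode.VolumeScanner.class)
    .setLevel(org.apache.log4j.Level.WARN);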

Member:
I could reproduce the failure locally:

$ ./start-build-env.sh
$ mvn clean install -DskipTests -Pnative
$ cd hadoop-hdfs-project/hadoop-hdfs
$ mvn test -Pnative -Pparallel-tests

Attached the stdout in the JIRA: https://issues.apache.org/jira/secure/attachment/13014321/org.apache.hadoop.hdfs.TestFileChecksum-output.txt

Member:
I could reproduce it even without -Pparallel-tests:

$ pwd
/home/aajisaka/hadoop/hadoop-hdfs-project/hadoop-hdfs
$ mvn test -Dtest=TestFileChecksum -Pnative

}

Path testPath = new Path(filePath);
@@ -588,6 +611,7 @@ private FileChecksum getFileChecksum(String filePath, int range,

if (dnIdxToDie != -1) {
cluster.restartDataNode(dnIdxToDie);
+ cluster.waitActive();
}

return fc;