Skip to content

Commit 7f40e66

Browse files
committed
HADOOP-16746. mkdirs and s3guard Authoritative mode.
Contributed by Steve Loughran. This fixes two problems with S3Guard authoritative mode and the auth directory flags which are stored in DynamoDB. 1. mkdirs was creating dir markers without the auth bit, forcing needless scans on newly created directories and files subsequently added; it was only with the first listStatus call on that directory that the dir would be marked as authoritative -even though it would be complete already. 2. listStatus(path) would reset the authoritative status bit of all child directories even if they were already marked as authoritative. Issue #2 is possibly the most expensive, as any treewalk using listStatus (e.g globfiles) would clear the auth bit for all child directories before listing them. And this would happen every single time... essentially you weren't getting authoritative directory listings. For the curious, that the major bug was actually found during testing -we'd all missed it during reviews. A lesson there: the better the tests the fewer the bugs. Maybe also: something obvious and significant can get by code reviews. modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/BulkOperationState.java modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/NullMetadataStore.java modified: hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3Guard.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardWriteBack.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestRestrictedReadAccess.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestPartialDeleteFailures.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreAuthoritativeMode.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java modified: hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestS3Guard.java Change-Id: Ic3ffda13f2af2430afedd50fd657b595c83e90a7
1 parent 1afd54f commit 7f40e66

16 files changed

+575
-153
lines changed

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3528,7 +3528,7 @@ void finishedWrite(String key, long length, String eTag, String versionId,
35283528
// information gleaned from addAncestors is preserved into the
35293529
// subsequent put.
35303530
stateToClose = S3Guard.initiateBulkWrite(metadataStore,
3531-
BulkOperationState.OperationType.Put,
3531+
BulkOperationState.OperationType.Mkdir,
35323532
keyToPath(key));
35333533
activeState = stateToClose;
35343534
}
@@ -3537,13 +3537,20 @@ void finishedWrite(String key, long length, String eTag, String versionId,
35373537
S3AFileStatus status = createUploadFileStatus(p,
35383538
isDir, length,
35393539
getDefaultBlockSize(p), username, eTag, versionId);
3540-
if (!isDir) {
3540+
boolean authoritative = false;
3541+
if (isDir) {
3542+
// this is a directory marker so put it as such.
3543+
status.setIsEmptyDirectory(Tristate.TRUE);
3544+
// and maybe mark as auth
3545+
authoritative = allowAuthoritative(p);
3546+
}
3547+
if (!authoritative) {
3548+
// for files and non-auth directories
35413549
S3Guard.putAndReturn(metadataStore, status,
35423550
ttlTimeProvider,
35433551
activeState);
35443552
} else {
3545-
// this is a directory marker so put it as such.
3546-
status.setIsEmptyDirectory(Tristate.TRUE);
3553+
// authoritative directory
35473554
S3Guard.putAuthDirectoryMarker(metadataStore, status,
35483555
ttlTimeProvider,
35493556
activeState);

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/BulkOperationState.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,5 +98,9 @@ public enum OperationType {
9898
* Listing update.
9999
*/
100100
Listing,
101+
/**
102+
* Mkdir operation.
103+
*/
104+
Mkdir,
101105
}
102106
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@
202202
* same region. The region may also be set explicitly by setting the config
203203
* parameter {@code fs.s3a.s3guard.ddb.region} to the corresponding region.
204204
*/
205+
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
205206
@InterfaceAudience.Private
206207
@InterfaceStability.Evolving
207208
public class DynamoDBMetadataStore implements MetadataStore,
@@ -899,19 +900,18 @@ private Collection<DDBPathMetadata> completeAncestry(
899900
List<DDBPathMetadata> sortedPaths = new ArrayList<>(pathsToCreate);
900901
sortedPaths.sort(PathOrderComparators.TOPMOST_PM_FIRST);
901902
// iterate through the paths.
902-
for (DDBPathMetadata meta : sortedPaths) {
903-
Preconditions.checkArgument(meta != null);
904-
Path path = meta.getFileStatus().getPath();
903+
for (DDBPathMetadata entry : sortedPaths) {
904+
Preconditions.checkArgument(entry != null);
905+
Path path = entry.getFileStatus().getPath();
905906
LOG.debug("Adding entry {}", path);
906907
if (path.isRoot()) {
907908
// this is a root entry: do not add it.
908909
break;
909910
}
910-
// create the new entry
911-
DDBPathMetadata entry = new DDBPathMetadata(meta);
912911
// add it to the ancestor state, failing if it is already there and
913912
// of a different type.
914913
DDBPathMetadata oldEntry = ancestorState.put(path, entry);
914+
boolean addAncestors = true;
915915
if (oldEntry != null) {
916916
if (!oldEntry.getFileStatus().isDirectory()
917917
|| !entry.getFileStatus().isDirectory()) {
@@ -928,12 +928,18 @@ private Collection<DDBPathMetadata> completeAncestry(
928928
// a directory is already present. Log and continue.
929929
LOG.debug("Directory at {} being updated with value {}",
930930
path, entry);
931+
// and we skip the the subsequent parent scan as we've already been
932+
// here
933+
addAncestors = false;
931934
}
932935
}
933936
// add the entry to the ancestry map as an explicitly requested entry.
934937
ancestry.put(path, Pair.of(EntryOrigin.Requested, entry));
938+
// now scan up the ancestor tree to see if there are any
939+
// immediately missing entries.
935940
Path parent = path.getParent();
936-
while (!parent.isRoot() && !ancestry.containsKey(parent)) {
941+
while (addAncestors
942+
&& !parent.isRoot() && !ancestry.containsKey(parent)) {
937943
if (!ancestorState.findEntry(parent, true)) {
938944
// there is no entry in the ancestor state.
939945
// look in the store
@@ -947,6 +953,9 @@ private Collection<DDBPathMetadata> completeAncestry(
947953
md = itemToPathMetadata(item, username);
948954
LOG.debug("Found existing entry for parent: {}", md);
949955
newEntry = Pair.of(EntryOrigin.Retrieved, md);
956+
// and we break, assuming that if there is an entry, its parents
957+
// are valid too.
958+
addAncestors = false;
950959
} else {
951960
// A directory entry was not found in the DB. Create one.
952961
LOG.debug("auto-create ancestor path {} for child path {}",
@@ -1439,13 +1448,15 @@ static S3AFileStatus makeDirStatus(Path f, String owner) {
14391448
* {@link #processBatchWriteRequest(DynamoDBMetadataStore.AncestorState, PrimaryKey[], Item[])}
14401449
* is only tried once.
14411450
* @param meta Directory listing metadata.
1451+
* @param unchangedEntries unchanged child entry paths
14421452
* @param operationState operational state for a bulk update
14431453
* @throws IOException IO problem
14441454
*/
14451455
@Override
14461456
@Retries.RetryTranslated
14471457
public void put(
14481458
final DirListingMetadata meta,
1459+
final List<Path> unchangedEntries,
14491460
@Nullable final BulkOperationState operationState) throws IOException {
14501461
LOG.debug("Saving {} dir meta for {} to table {} in region {}: {}",
14511462
meta.isAuthoritative() ? "auth" : "nonauth",
@@ -1463,8 +1474,14 @@ public void put(
14631474
final List<DDBPathMetadata> metasToPut = fullPathsToPut(ddbPathMeta,
14641475
ancestorState);
14651476

1466-
// next add all children of the directory
1467-
metasToPut.addAll(pathMetaToDDBPathMeta(meta.getListing()));
1477+
// next add all changed children of the directory
1478+
// ones that came from the previous listing are left as-is
1479+
final Collection<PathMetadata> children = meta.getListing()
1480+
.stream()
1481+
.filter(e -> !unchangedEntries.contains(e.getFileStatus().getPath()))
1482+
.collect(Collectors.toList());
1483+
1484+
metasToPut.addAll(pathMetaToDDBPathMeta(children));
14681485

14691486
// sort so highest-level entries are written to the store first.
14701487
// if a sequence fails, no orphan entries will have been written.

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import java.util.Collections;
4646
import java.util.HashMap;
4747
import java.util.LinkedList;
48+
import java.util.List;
4849
import java.util.Map;
4950
import java.util.concurrent.TimeUnit;
5051
import java.util.concurrent.atomic.AtomicLong;
@@ -341,13 +342,14 @@ public void put(PathMetadata meta,
341342

342343
@Override
343344
public synchronized void put(DirListingMetadata meta,
345+
final List<Path> unchangedEntries,
344346
final BulkOperationState operationState) throws IOException {
345347
if (LOG.isDebugEnabled()) {
346348
LOG.debug("put dirMeta {}", meta.prettyPrint());
347349
}
348350
LocalMetadataEntry entry =
349351
localCache.getIfPresent(standardize(meta.getPath()));
350-
if(entry == null){
352+
if (entry == null) {
351353
localCache.put(standardize(meta.getPath()), new LocalMetadataEntry(meta));
352354
} else {
353355
entry.setDirListingMetadata(meta);

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.io.Closeable;
2323
import java.io.IOException;
2424
import java.util.Collection;
25+
import java.util.List;
2526
import java.util.Map;
2627

2728
import com.google.common.annotations.VisibleForTesting;
@@ -265,11 +266,19 @@ void put(Collection<? extends PathMetadata> metas,
265266
* missing metadata updates (create, delete) made to the same path by
266267
* another process.
267268
*
269+
* To optimize updates and avoid overwriting existing entries which
270+
* may contain extra data, entries in the list of unchangedEntries may
271+
* be excluded. That is: the listing metadata has the full list of
272+
* what it believes are children, but implementations can opt to ignore
273+
* some.
268274
* @param meta Directory listing metadata.
275+
* @param unchangedEntries list of entries in the dir listing which have
276+
* not changed since the directory was list scanned on s3guard.
269277
* @param operationState operational state for a bulk update
270278
* @throws IOException if there is an error
271279
*/
272280
void put(DirListingMetadata meta,
281+
final List<Path> unchangedEntries,
273282
@Nullable BulkOperationState operationState) throws IOException;
274283

275284
/**

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/NullMetadataStore.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import java.io.IOException;
3131
import java.util.Collection;
3232
import java.util.HashMap;
33+
import java.util.List;
3334
import java.util.Map;
3435

3536
/**
@@ -113,6 +114,7 @@ public void put(Collection<? extends PathMetadata> meta,
113114

114115
@Override
115116
public void put(DirListingMetadata meta,
117+
final List<Path> unchangedEntries,
116118
final BulkOperationState operationState) throws IOException {
117119
}
118120

0 commit comments

Comments
 (0)