Commit 9ba3f4e

HADOOP-17318: address feedback.
Also added config option fs.s3a.committer.uuid.source which is set in the jobconf during job setup, used in test to verify source of ID. Change-Id: I9eb44113bc6afd5826c8a51bdf16fb220f8fb111
1 parent 51c2291 commit 9ba3f4e
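
For context, a minimal sketch of what the new option makes visible to downstream code after job setup (hypothetical snippet; the source label string is a placeholder, not the enum's actual `getText()` value):

```java
import org.apache.hadoop.conf.Configuration;

public class UuidSourceCheck {
  public static void main(String[] args) {
    // After AbstractS3ACommitter.setupJob() runs, the job configuration
    // carries both the job UUID and a record of where it came from.
    Configuration jobConf = new Configuration(false);
    // Normally written by job setup; set here only so the sketch runs.
    jobConf.set("fs.s3a.committer.uuid", "0613f169-example-uuid");
    jobConf.set("fs.s3a.committer.uuid.source", "example-source"); // placeholder label
    System.out.println("Job UUID " + jobConf.get("fs.s3a.committer.uuid")
        + " came from " + jobConf.get("fs.s3a.committer.uuid.source"));
  }
}
```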

File tree: 9 files changed, +108 −60 lines

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java

Lines changed: 2 additions & 2 deletions
@@ -131,7 +131,7 @@ protected WriteOperationHelper(S3AFileSystem owner, Configuration conf) {
    */
   void operationRetried(String text, Exception ex, int retries,
       boolean idempotent) {
-    LOG.info("{}: Retried {}: {}", retries, text, ex.toString());
+    LOG.info("{}: Retried {}: {}", text, retries, ex.toString());
     LOG.debug("Stack", ex);
     owner.operationRetried(text, ex, retries, idempotent);
   }
@@ -590,7 +590,7 @@ public BulkOperationState initiateOperation(final Path path,
   public UploadPartResult uploadPart(UploadPartRequest request)
       throws IOException {
     return retry("upload part #" + request.getPartNumber()
-        + " upload "+ request.getUploadId(),
+        + " upload ID "+ request.getUploadId(),
         request.getKey(),
         true,
         () -> owner.uploadPart(request));

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/AbstractS3ACommitter.java

Lines changed: 40 additions & 20 deletions
@@ -65,6 +65,7 @@
 import static org.apache.hadoop.fs.s3a.commit.CommitUtilsWithMR.*;
 import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.E_NO_SPARK_UUID;
 import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.FS_S3A_COMMITTER_UUID;
+import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.FS_S3A_COMMITTER_UUID_SOURCE;
 import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.SPARK_WRITE_UUID;

 /**
@@ -94,14 +95,25 @@
  * committer was large enough for more all the parallel POST requests.
  */
 public abstract class AbstractS3ACommitter extends PathOutputCommitter {
+
   private static final Logger LOG =
       LoggerFactory.getLogger(AbstractS3ACommitter.class);

   public static final String THREAD_PREFIX = "s3a-committer-pool-";

   /**
-   * Unique ID for a Job. On Spark this MUST NOT be the YARN JobID;
-   * on MapReduce it MUST BE that.
+   * Error string when task setup fails.
+   */
+  @VisibleForTesting
+  public static final String E_SELF_GENERATED_JOB_UUID
+      = "has a self-generated job UUID";
+
+  /**
+   * Unique ID for a Job.
+   * In MapReduce Jobs the YARN JobID suffices.
+   * On Spark this may only be the YARN JobID if
+   * it is known to be creating strongly unique IDs
+   * (i.e. SPARK-33402 is on the branch).
    */
   private final String uuid;

@@ -175,17 +187,17 @@ protected AbstractS3ACommitter(
     setConf(context.getConfiguration());
     Pair<String, JobUUIDSource> id = buildJobUUID(
         conf, context.getJobID());
-    uuid = id.getLeft();
-    uuidSource = id.getRight();
+    this.uuid = id.getLeft();
+    this.uuidSource = id.getRight();
     LOG.info("Job UUID {} source {}", getUUID(), getUUIDSource().getText());
     initOutput(outputPath);
     LOG.debug("{} instantiated for job \"{}\" ID {} with destination {}",
         role, jobName(context), jobIdString(context), outputPath);
     S3AFileSystem fs = getDestS3AFS();
-    createJobMarker = context.getConfiguration().getBoolean(
+    this.createJobMarker = context.getConfiguration().getBoolean(
         CREATE_SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
         DEFAULT_CREATE_SUCCESSFUL_JOB_DIR_MARKER);
-    commitOperations = new CommitOperations(fs);
+    this.commitOperations = new CommitOperations(fs);
   }

   /**
@@ -483,11 +495,8 @@ public void setupJob(JobContext context) throws IOException {
       jobSetup = true;
       // patch job conf with the job UUID.
       Configuration c = context.getConfiguration();
-      c.set(FS_S3A_COMMITTER_UUID, this.getUUID());
-      if (getUUIDSource() == JobUUIDSource.GeneratedLocally) {
-        // we set the UUID up locally. Save it back to the job configuration
-        c.set(SPARK_WRITE_UUID, this.getUUID());
-      }
+      c.set(FS_S3A_COMMITTER_UUID, getUUID());
+      c.set(FS_S3A_COMMITTER_UUID_SOURCE, getUUIDSource().getText());
       Path dest = getOutputPath();
       if (createJobMarker){
         commitOperations.deleteSuccessMarker(dest);
@@ -517,7 +526,7 @@ && getUUIDSource() == JobUUIDSource.GeneratedLocally) {
         // generated locally.
         throw new PathCommitException(getOutputPath().toString(),
             "Task attempt " + attemptID
-                + " only has a self-generated job UUID");
+                + " " + E_SELF_GENERATED_JOB_UUID);
       }
       Path taskAttemptPath = getTaskAttemptPath(context);
       FileSystem fs = taskAttemptPath.getFileSystem(getConf());
@@ -1209,16 +1218,25 @@ protected void warnOnActiveUploads(final Path path) {
    * </p>
    * <p>
    * Spark will use a fake app ID based on the current time.
-   * This can lead to collisions on busy clusters.
-   *
+   * This can lead to collisions on busy clusters unless
+   * the specific spark release has SPARK-33402 applied.
+   * This appends a random long value to the timestamp, so
+   * it is unique enough that the risk of collision is almost
+   * nonexistent.
+   * </p>
+   * <p>
+   * The order of selection of a uuid is:
    * </p>
    * <ol>
    *   <li>Value of
    *   {@link InternalCommitterConstants#FS_S3A_COMMITTER_UUID}.</li>
    *   <li>Value of
    *   {@link InternalCommitterConstants#SPARK_WRITE_UUID}.</li>
-   *   <li>If enabled: Self-generated uuid.</li>
-   *   <li>If not disabled: Application ID</li>
+   *   <li>If enabled through
+   *   {@link CommitConstants#FS_S3A_COMMITTER_GENERATE_UUID}:
+   *   Self-generated uuid.</li>
+   *   <li>If {@link CommitConstants#FS_S3A_COMMITTER_REQUIRE_UUID}
+   *   is not set: Application ID</li>
    * </ol>
    * The UUID bonding takes place during construction;
    * the staging committers use it to set up their wrapped
@@ -1263,16 +1281,18 @@ protected void warnOnActiveUploads(final Path path) {

     // Check the job hasn't declared a requirement for the UUID.
     // This allows or fail-fast validation of Spark behavior.
-    if (conf.getBoolean(FS_S3A_COMMITTER_REQUIRE_UUID, false)) {
+    if (conf.getBoolean(FS_S3A_COMMITTER_REQUIRE_UUID,
+        DEFAULT_S3A_COMMITTER_REQUIRE_UUID)) {
       throw new PathCommitException("", E_NO_SPARK_UUID);
     }

-    // see if the job can generate a random UUID
-    if (conf.getBoolean(FS_S3A_COMMITTER_GENERATE_UUID, false)) {
+    // see if the job can generate a random UUID
+    if (conf.getBoolean(FS_S3A_COMMITTER_GENERATE_UUID,
+        DEFAULT_S3A_COMMITTER_GENERATE_UUID)) {
       // generate a random UUID. This is OK for a job, for a task
       // it means that the data may not get picked up.
       String newId = UUID.randomUUID().toString();
-      LOG.warn("No job ID in configuration; generating a randem ID: {}",
+      LOG.warn("No job ID in configuration; generating a random ID: {}",
           newId);
       return Pair.of(newId, JobUUIDSource.GeneratedLocally);
     }
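
The selection order in that javadoc can be read as a small decision chain. A minimal sketch under the commit's semantics (the class and method are invented for illustration; the real logic lives in `buildJobUUID`):

```java
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;

public final class JobUuidSelectionSketch {
  /** Sketch of the UUID selection order; not the committer's real method. */
  public static String select(Configuration conf, String applicationId) {
    // 1. An explicitly set committer UUID always wins.
    String id = conf.getTrimmed("fs.s3a.committer.uuid", "");
    if (!id.isEmpty()) {
      return id;
    }
    // 2. Next, the UUID passed down by Spark.
    id = conf.getTrimmed("spark.sql.sources.writeJobUUID", "");
    if (!id.isEmpty()) {
      return id;
    }
    // 3. If the job requires a UUID and none was found: fail fast.
    if (conf.getBoolean("fs.s3a.committer.require.uuid", false)) {
      throw new IllegalStateException("no job UUID supplied");
    }
    // 4. Optionally self-generate one (fine for a job; risky for tasks
    // set up outside that job, as the test changes below verify).
    if (conf.getBoolean("fs.s3a.committer.generate.uuid", false)) {
      return UUID.randomUUID().toString();
    }
    // 5. Fall back to the application/job ID.
    return applicationId;
  }
}
```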

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/CommitConstants.java

Lines changed: 12 additions & 0 deletions
@@ -294,6 +294,12 @@ private CommitConstants() {
   public static final String FS_S3A_COMMITTER_REQUIRE_UUID =
       "fs.s3a.committer.require.uuid";

+  /**
+   * Default value for {@link #FS_S3A_COMMITTER_REQUIRE_UUID}: {@value}.
+   */
+  public static final boolean DEFAULT_S3A_COMMITTER_REQUIRE_UUID =
+      false;
+
   /**
    * Generate a UUID in job setup rather than fall back to
    * YARN Application attempt ID.
@@ -304,4 +310,10 @@ private CommitConstants() {
   public static final String FS_S3A_COMMITTER_GENERATE_UUID =
       "fs.s3a.committer.generate.uuid";

+  /**
+   * Default value for {@link #FS_S3A_COMMITTER_GENERATE_UUID}: {@value}.
+   */
+  public static final boolean DEFAULT_S3A_COMMITTER_GENERATE_UUID =
+      false;
+
 }

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/InternalCommitterConstants.java

Lines changed: 6 additions & 5 deletions
@@ -49,6 +49,12 @@ private InternalCommitterConstants() {
   public static final String FS_S3A_COMMITTER_UUID =
       "fs.s3a.committer.uuid";

+  /**
+   * Where did the UUID come from? {@value}.
+   */
+  public static final String FS_S3A_COMMITTER_UUID_SOURCE =
+      "fs.s3a.committer.uuid.source";
+
   /**
    * Directory committer factory: {@value}.
    */
@@ -105,11 +111,6 @@ private InternalCommitterConstants() {
   public static final String SPARK_WRITE_UUID =
       "spark.sql.sources.writeJobUUID";

-  /**
-   * The App ID for jobs: {@value}.
-   */
-  public static final String SPARK_APP_ID = "spark.app.id";
-
   /**
    * Java temp dir: {@value}.
    */

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/staging/StagingCommitter.java

Lines changed: 2 additions & 2 deletions
@@ -117,7 +117,7 @@ public StagingCommitter(Path outputPath,
     this.uniqueFilenames = conf.getBoolean(
         FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES,
         DEFAULT_STAGING_COMMITTER_UNIQUE_FILENAMES);
-    setWorkPath(buildWorkPath(context, this.getUUID()));
+    setWorkPath(buildWorkPath(context, getUUID()));
     this.wrappedCommitter = createWrappedCommitter(context, conf);
     setOutputPath(constructorOutputPath);
     Path finalOutputPath = getOutputPath();
@@ -174,7 +174,7 @@ public String toString() {
     sb.append(", commitsDirectory=").append(commitsDirectory);
     sb.append(", uniqueFilenames=").append(uniqueFilenames);
     sb.append(", conflictResolution=").append(conflictResolution);
-    sb.append(". uploadPartSize=").append(uploadPartSize);
+    sb.append(", uploadPartSize=").append(uploadPartSize);
     if (wrappedCommitter != null) {
       sb.append(", wrappedCommitter=").append(wrappedCommitter);
     }

hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md

Lines changed: 17 additions & 18 deletions
@@ -530,30 +530,29 @@ Amazon S3, that means S3Guard must *always* be enabled.

 Conflict management is left to the execution engine itself.

-## Committer Configuration Options
+## Common Committer Options


-| Option | Magic | Directory | Partitioned | Meaning | Default |
-|--------|-------|-----------|-------------|---------|---------|
-| `mapreduce.fileoutputcommitter.marksuccessfuljobs` | X | X | X | Write a `_SUCCESS` file on the successful completion of the job. | `true` |
-| `fs.s3a.buffer.dir` | X | X | X | Local filesystem directory for data being written and/or staged. | `${hadoop.tmp.dir}/s3a` |
-| `fs.s3a.committer.magic.enabled` | X | | | Enable "magic committer" support in the filesystem. | `false` |
-| `fs.s3a.committer.abort.pending.uploads` | X | X | X | list and abort all pending uploads under the destination path when the job is committed or aborted. | `true` |
-| `fs.s3a.committer.threads` | X | X | X | Number of threads in committers for parallel operations on files. | 8 |
-| `fs.s3a.committer.generate.uuid` | | X | X | Generate a Job UUID if none is passed down from Spark | `false` |
-| `fs.s3a.committer.require.uuid` | | X | X | Require the Job UUID to be passed down from Spark | `false` |
+| Option | Meaning | Default |
+|--------|---------|---------|
+| `mapreduce.fileoutputcommitter.marksuccessfuljobs` | Write a `_SUCCESS` file on the successful completion of the job. | `true` |
+| `fs.s3a.buffer.dir` | Local filesystem directory for data being written and/or staged. | `${hadoop.tmp.dir}/s3a` |
+| `fs.s3a.committer.magic.enabled` | Enable "magic committer" support in the filesystem. | `false` |
+| `fs.s3a.committer.abort.pending.uploads` | List and abort all pending uploads under the destination path when the job is committed or aborted. | `true` |
+| `fs.s3a.committer.threads` | Number of threads in committers for parallel operations on files. | 8 |
+| `fs.s3a.committer.generate.uuid` | Generate a Job UUID if none is passed down from Spark. | `false` |
+| `fs.s3a.committer.require.uuid` | Require the Job UUID to be passed down from Spark. | `false` |


-Staging committer (Directory and Partitioned) options
+## Staging committer (Directory and Partitioned) options


-| Option | Magic | Directory | Partitioned | Meaning | Default |
-|--------|-------|-----------|-------------|---------|---------|
-
-| `fs.s3a.committer.staging.conflict-mode` | | X | X | Conflict resolution: `fail`, `append` or `replace`| `append` |
-| `fs.s3a.committer.staging.tmp.path` | | X | X | Path in the cluster filesystem for temporary data. | `tmp/staging` |
-| `fs.s3a.committer.staging.unique-filenames` | | X | X | Generate unique filenames. | `true` |
-| `fs.s3a.committer.staging.abort.pending.uploads` | | X | X | Deprecated; replaced by `fs.s3a.committer.abort.pending.uploads`. | |
+| Option | Meaning | Default |
+|--------|---------|---------|
+| `fs.s3a.committer.staging.conflict-mode` | Conflict resolution: `fail`, `append` or `replace`. | `append` |
+| `fs.s3a.committer.staging.tmp.path` | Path in the cluster filesystem for temporary data. | `tmp/staging` |
+| `fs.s3a.committer.staging.unique-filenames` | Generate unique filenames. | `true` |
+| `fs.s3a.committer.staging.abort.pending.uploads` | Deprecated; replaced by `fs.s3a.committer.abort.pending.uploads`. | `(false)` |


 ### Common Committer Options
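
As a usage note for the two UUID rows above, a hedged sketch of setting these options on a job configuration (the property names come from the table; the surrounding job setup is assumed):

```java
import org.apache.hadoop.conf.Configuration;

public class CommitterUuidOptions {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Fail fast unless the engine (e.g. Spark) passes a job UUID down;
    // this guards against two concurrent jobs sharing a timestamp-derived ID.
    conf.setBoolean("fs.s3a.committer.require.uuid", true);
    // Or: let the committer generate its own UUID during job setup.
    // Safe for a single job; risky for tasks set up outside that job.
    conf.setBoolean("fs.s3a.committer.generate.uuid", false);
    System.out.println("require.uuid = "
        + conf.getBoolean("fs.s3a.committer.require.uuid", false));
  }
}
```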

hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md

Lines changed: 12 additions & 1 deletion
@@ -104,6 +104,7 @@ Fixes:
 revert to the JVM SSL implementation when the wildfly
 or native openssl libraries cannot be loaded.

+
 ## <a name="authentication"></a> Authentication Failure

 If Hadoop cannot authenticate with the S3 service endpoint,
@@ -286,7 +287,17 @@ There's two main causes
 classloader, so the JVM does not consider it to be an implementation.
 Fix: learn the entire JVM classloader model and see if you can then debug it.
 Tip: having both the AWS Shaded SDK and individual AWS SDK modules on your classpath
-may be a cause of this
+may be a cause of this.
+
+If you see this and you are trying to use the S3A connector with Spark, then the cause can
+be that the isolated classloader used to load Hive classes is interfering with the S3A
+connector's dynamic loading of `com.amazonaws` classes. To fix this, declare that
+the classes in the aws SDK are loaded from the same classloader which instantiated
+the S3A FileSystem instance:
+
+```
+spark.sql.hive.metastore.sharedPrefixes com.amazonaws.
+```

 ## <a name="access_denied"></a> "The security token included in the request is invalid"
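
The same property can also be set programmatically when the Spark session is built; a minimal sketch (generic Spark API, not part of this commit):

```java
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public class SharedPrefixesExample {
  public static void main(String[] args) {
    // Keep the AWS SDK classes on the same classloader as the S3A FileSystem
    // by marking them as "shared" with the isolated Hive metastore classloader.
    SparkConf conf = new SparkConf()
        .setAppName("s3a-shared-prefixes")
        .set("spark.sql.hive.metastore.sharedPrefixes", "com.amazonaws.");
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
    spark.stop();
  }
}
```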

hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java

Lines changed: 16 additions & 11 deletions
@@ -71,9 +71,11 @@
 import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
 import static org.apache.hadoop.fs.s3a.S3AUtils.*;
 import static org.apache.hadoop.fs.s3a.S3ATestUtils.*;
+import static org.apache.hadoop.fs.s3a.commit.AbstractS3ACommitter.E_SELF_GENERATED_JOB_UUID;
 import static org.apache.hadoop.fs.s3a.commit.CommitConstants.*;
 import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.E_NO_SPARK_UUID;
 import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.FS_S3A_COMMITTER_UUID;
+import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.FS_S3A_COMMITTER_UUID_SOURCE;
 import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.SPARK_WRITE_UUID;
 import static org.apache.hadoop.test.LambdaTestUtils.*;

@@ -1499,16 +1501,16 @@ public void testParallelJobsToSameDestination() throws Throwable {
     Job job2 = newJob(outDir,
         c2,
         attempt2);
-    Configuration conf2 = job2.getConfiguration();
-    conf2.set("mapreduce.output.basename", "task2");
+    Configuration jobConf2 = job2.getConfiguration();
+    jobConf2.set("mapreduce.output.basename", "task2");
     String stage2Id = UUID.randomUUID().toString();
-    conf2.set(SPARK_WRITE_UUID,
+    jobConf2.set(SPARK_WRITE_UUID,
         stage2Id);

-    JobContext jContext2 = new JobContextImpl(conf2,
+    JobContext jContext2 = new JobContextImpl(jobConf2,
         taskAttempt2.getJobID());
     TaskAttemptContext tContext2 =
-        new TaskAttemptContextImpl(conf2, taskAttempt2);
+        new TaskAttemptContextImpl(jobConf2, taskAttempt2);
     AbstractS3ACommitter committer2 = createCommitter(outDir, tContext2);
     Assertions.assertThat(committer2.getJobAttemptPath(jContext2))
         .describedAs("Job attempt path of %s", committer2)
@@ -1548,7 +1550,7 @@ public void testParallelJobsToSameDestination() throws Throwable {
     if (multipartInitiatedInWrite) {
       // magic committer runs -commit job1 while a job2 TA has an open
       // writer (and hence: open MP Upload)
-      LOG.info("Commit Job 1");
+      LOG.info("With Multipart Initiated In Write: Commit Job 1");
       commitJob(committer1, jContext1);
     }

@@ -1567,7 +1569,7 @@ public void testParallelJobsToSameDestination() throws Throwable {
     if (!multipartInitiatedInWrite) {
       // if not a magic committer, commit the job now. Because at
       // this point the staging committer tasks from job2 will be pending
-      LOG.info("Commit Job 1");
+      LOG.info("With Multipart NOT Initiated In Write: Commit Job 1");
       assertJobAttemptPathExists(committer1, jContext1);
       commitJob(committer1, jContext1);
     }
@@ -1621,13 +1623,14 @@ public void testSelfGeneratedUUID() throws Throwable {
         .describedAs("UUID source of %s", committer)
         .isEqualTo(AbstractS3ACommitter.JobUUIDSource.GeneratedLocally);

+    // examine the job configuration and verify that it has been updated
     Configuration jobConf = jobData.conf;
     Assertions.assertThat(jobConf.get(FS_S3A_COMMITTER_UUID, null))
         .describedAs("Config option " + FS_S3A_COMMITTER_UUID)
         .isEqualTo(uuid);
-    Assertions.assertThat(jobConf.get(SPARK_WRITE_UUID, null))
-        .describedAs("Config option " + SPARK_WRITE_UUID)
-        .isEqualTo(uuid);
+    Assertions.assertThat(jobConf.get(FS_S3A_COMMITTER_UUID_SOURCE, null))
+        .describedAs("Config option " + FS_S3A_COMMITTER_UUID_SOURCE)
+        .isEqualTo(AbstractS3ACommitter.JobUUIDSource.GeneratedLocally.getText());

     // because the task was set up in the job, it can have task
     // setup called, even though it had a random ID.
@@ -1643,7 +1646,9 @@ public void testSelfGeneratedUUID() throws Throwable {
     assertNotEquals("job UUIDs",
         committer.getUUID(),
         committer2.getUUID());
-    intercept(PathCommitException.class, () -> {
+    // Task setup MUST fail.
+    intercept(PathCommitException.class,
+        E_SELF_GENERATED_JOB_UUID, () -> {
       committer2.setupTask(tContext2);
       return committer2;
     });
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingCommitter.java

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,6 @@
 import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
 import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;

-import org.apache.hadoop.fs.s3a.commit.PathCommitException;
 import org.apache.hadoop.thirdparty.com.google.common.collect.Sets;
 import org.assertj.core.api.Assertions;
 import org.hamcrest.core.StringStartsWith;
@@ -57,6 +56,7 @@
 import org.apache.hadoop.fs.s3a.MockS3AFileSystem;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
 import org.apache.hadoop.fs.s3a.commit.AbstractS3ACommitter;
+import org.apache.hadoop.fs.s3a.commit.PathCommitException;
 import org.apache.hadoop.fs.s3a.commit.files.PendingSet;
 import org.apache.hadoop.fs.s3a.commit.files.SinglePendingCommit;
 import org.apache.hadoop.mapred.JobConf;
