Commit faedcd9

databricks-david-lewis authored and cloud-fan committed
[SPARK-41970] Introduce SparkPath for typesafety
### What changes were proposed in this pull request?

This PR proposes a strongly typed `SparkPath` that encapsulates a url-encoded string. It has helper methods for creating hadoop paths, uris, and uri-encoded strings. The intent is to identify and fix various bugs in the way that Spark handles these paths. To do this we introduced the SparkPath type to `PartitionedFile` (a widely used class) and then started fixing compile errors; in doing so we fixed various bugs.

### Why are the changes needed?

Given `val str = "s3://bucket/path with space/a"`, there is a difference between `new Path(str)` and `new Path(new URI(str))`, and thus a difference between `new URI(str)` and `new Path(str).toUri`. Both `URI` and `Path` are symmetric in construction and `toString`, but they are not interchangeable. Spark confuses these two path forms (uri-encoded vs. not); this PR attempts to use types to disambiguate them.

### Does this PR introduce _any_ user-facing change?

This PR proposes changing the public API of `PartitionedFile`, and various other methods, in the name of type safety. It needs to be clear to callers of an API what type of path string is expected.

### How was this patch tested?

We rely on existing tests, and update the default temp path creation to include paths with spaces.

Closes #39488 from databricks-david-lewis/SPARK_PATH.

Authored-by: David Lewis <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
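For illustration (not part of the original commit message), here is a minimal Scala sketch of the asymmetry described above, using the example string from the description. The commented values are the expected outputs given hadoop `Path`'s url-encoding behavior:

```scala
import java.net.URI
import org.apache.hadoop.fs.Path

object PathEncodingSketch extends App {
  // The un-encoded ("hadoop Path") form of the string, with a literal space.
  val str = "s3://bucket/path with space/a"

  val p = new Path(str)
  println(p.toString)       // s3://bucket/path with space/a      (un-encoded form)
  println(p.toUri.toString) // s3://bucket/path%20with%20space/a  (url-encoded form)

  // Round-tripping the url-encoded form through a URI yields an equal Path...
  val encoded = p.toUri.toString
  println(new Path(new URI(encoded)) == p) // expected: true
  // ...but passing it straight to Path(String) treats "%20" as literal characters,
  // which is exactly the confusion that SparkPath is meant to rule out.
  println(new Path(encoded) == p)          // expected: false
}
```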
1 parent 498b3ec commit faedcd9

42 files changed: +216, -133 lines changed


connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala

Lines changed: 2 additions & 3 deletions
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.avro
 
 import java.io._
-import java.net.URI
 
 import scala.util.control.NonFatal
 
@@ -96,9 +95,9 @@ private[sql] class AvroFileFormat extends FileFormat
       // Doing input file filtering is improper because we may generate empty tasks that process no
       // input files but stress the scheduler. We should probably add a more general input file
       // filtering mechanism for `FileFormat` data sources. See SPARK-16317.
-      if (parsedOptions.ignoreExtension || file.filePath.endsWith(".avro")) {
+      if (parsedOptions.ignoreExtension || file.urlEncodedPath.endsWith(".avro")) {
        val reader = {
-          val in = new FsInput(new Path(new URI(file.filePath)), conf)
+          val in = new FsInput(file.toPath, conf)
          try {
            val datumReader = userProvidedSchema match {
              case Some(userSchema) => new GenericDatumReader[GenericRecord](userSchema)

connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala

Lines changed: 2 additions & 5 deletions
@@ -16,14 +16,11 @@
  */
 package org.apache.spark.sql.v2.avro
 
-import java.net.URI
-
 import scala.util.control.NonFatal
 
 import org.apache.avro.file.DataFileReader
 import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
 import org.apache.avro.mapred.FsInput
-import org.apache.hadoop.fs.Path
 
 import org.apache.spark.TaskContext
 import org.apache.spark.broadcast.Broadcast
@@ -62,9 +59,9 @@ case class AvroPartitionReaderFactory(
    val conf = broadcastedConf.value.value
    val userProvidedSchema = options.schema
 
-    if (options.ignoreExtension || partitionedFile.filePath.endsWith(".avro")) {
+    if (options.ignoreExtension || partitionedFile.urlEncodedPath.endsWith(".avro")) {
      val reader = {
-        val in = new FsInput(new Path(new URI(partitionedFile.filePath)), conf)
+        val in = new FsInput(partitionedFile.toPath, conf)
        try {
          val datumReader = userProvidedSchema match {
            case Some(userSchema) => new GenericDatumReader[GenericRecord](userSchema)

connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala

Lines changed: 2 additions & 4 deletions
@@ -18,13 +18,11 @@
 package org.apache.spark.sql.avro
 
 import java.io._
-import java.net.URI
 
 import org.apache.avro.file.DataFileReader
 import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
 import org.apache.avro.mapred.FsInput
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
@@ -62,8 +60,8 @@ class AvroRowReaderSuite
      case BatchScanExec(_, f: AvroScan, _, _, _, _, _) => f
    }
    val filePath = fileScan.get.fileIndex.inputFiles(0)
-    val fileSize = new File(new URI(filePath)).length
-    val in = new FsInput(new Path(new URI(filePath)), new Configuration())
+    val fileSize = new File(filePath.toUri).length
+    val in = new FsInput(filePath.toPath, new Configuration())
    val reader = DataFileReader.openReader(in, new GenericDatumReader[GenericRecord]())
 
    val it = new Iterator[InternalRow] with AvroUtils.RowReader {

connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala

Lines changed: 2 additions & 1 deletion
@@ -2357,7 +2357,8 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper {
      assert(fileScan.get.dataFilters.nonEmpty)
      assert(fileScan.get.planInputPartitions().forall { partition =>
        partition.asInstanceOf[FilePartition].files.forall { file =>
-          file.filePath.contains("p1=1") && file.filePath.contains("p2=2")
+          file.urlEncodedPath.contains("p1=1") &&
+            file.urlEncodedPath.contains("p2=2")
        }
      })
      checkAnswer(df, Row("b", 1, 2))

core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ private[spark] class WorkerWatcher(
  private[deploy] var isShutDown = false
 
  // Lets filter events only from the worker's rpc system
-  private val expectedAddress = RpcAddress.fromURIString(workerUrl)
+  private val expectedAddress = RpcAddress.fromUrlString(workerUrl)
  private def isWorker(address: RpcAddress) = expectedAddress == address
 
  private def exitNonZero() =
core/src/main/scala/org/apache/spark/paths/SparkPath.scala

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.paths
+
+import java.net.URI
+
+import org.apache.hadoop.fs.{FileStatus, Path}
+
+/**
+ * A canonical representation of a file path. This class is intended to provide
+ * type-safety to the way that Spark handles Paths. Paths can be represented as
+ * Strings in multiple ways, which are not always compatible. Spark regularly uses
+ * two ways: 1. hadoop Path.toString and java URI.toString.
+ */
+case class SparkPath private (private val underlying: String) {
+  def urlEncoded: String = underlying
+  def toUri: URI = new URI(underlying)
+  def toPath: Path = new Path(toUri)
+  override def toString: String = underlying
+}
+
+object SparkPath {
+  /**
+   * Creates a SparkPath from a hadoop Path string.
+   * Please be very sure that the provided string is encoded (or not encoded) in the right way.
+   *
+   * Please see the hadoop Path documentation here:
+   * https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/Path.html#Path-java.lang.String-
+   */
+  def fromPathString(str: String): SparkPath = fromPath(new Path(str))
+  def fromPath(path: Path): SparkPath = fromUri(path.toUri)
+  def fromFileStatus(fs: FileStatus): SparkPath = fromPath(fs.getPath)
+
+  /**
+   * Creates a SparkPath from a url-encoded string.
+   * Note: It is the responsibility of the caller to ensure that str is a valid url-encoded string.
+   */
+  def fromUrlString(str: String): SparkPath = SparkPath(str)
+  def fromUri(uri: URI): SparkPath = fromUrlString(uri.toString)
+}
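For orientation, a small usage sketch of the new class (not part of the diff). The commented values assume hadoop `Path.toUri` url-encodes the space as `%20`:

```scala
import org.apache.spark.paths.SparkPath

// From an un-encoded hadoop Path string: encoding happens via Path.toUri.
val fromHadoop = SparkPath.fromPathString("s3://bucket/path with space/a")
fromHadoop.urlEncoded          // expected: "s3://bucket/path%20with%20space/a"

// From a string that is already url-encoded: stored as-is.
val fromUrl = SparkPath.fromUrlString("s3://bucket/path%20with%20space/a")

fromHadoop == fromUrl          // expected: true, both wrap the same url-encoded string
fromHadoop.toPath              // a hadoop Path built from the decoded URI
fromHadoop.toUri               // a java.net.URI of the url-encoded string
```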

core/src/main/scala/org/apache/spark/rpc/RpcAddress.scala

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ private[spark] case class RpcAddress(_host: String, port: Int) {
 private[spark] object RpcAddress {
 
  /** Return the [[RpcAddress]] represented by `uri`. */
-  def fromURIString(uri: String): RpcAddress = {
+  def fromUrlString(uri: String): RpcAddress = {
    val uriObj = new java.net.URI(uri)
    RpcAddress(uriObj.getHost, uriObj.getPort)
  }

mllib/src/main/scala/org/apache/spark/ml/source/image/ImageFileFormat.scala

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,7 @@ package org.apache.spark.ml.source.image
 
 import com.google.common.io.{ByteStreams, Closeables}
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.fs.FileStatus
 import org.apache.hadoop.mapreduce.Job
 
 import org.apache.spark.ml.image.ImageSchema
@@ -71,8 +71,8 @@ private[image] class ImageFileFormat extends FileFormat with DataSourceRegister
      if (!imageSourceOptions.dropInvalid && requiredSchema.isEmpty) {
        Iterator(emptyUnsafeRow)
      } else {
-        val origin = file.filePath
-        val path = new Path(origin)
+        val origin = file.urlEncodedPath
+        val path = file.toPath
        val fs = path.getFileSystem(broadcastedHadoopConf.value.value)
        val stream = fs.open(path)
        val bytes = try {

scalastyle-config.xml

Lines changed: 8 additions & 0 deletions
@@ -437,4 +437,12 @@ This file is divided into 3 sections:
    Use org.apache.spark.util.Utils.createTempDir instead.
    </customMessage>
  </check>
+
+  <check customId="pathfromuri" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">new Path\(new URI\(</parameter></parameters>
+    <customMessage><![CDATA[
+      Are you sure that this string is uri encoded? Please be careful when converting hadoop Paths
+      and URIs to and from String. If possible, please use SparkPath.
+    ]]></customMessage>
+  </check>
 </scalastyle>
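As an illustration of what the new `pathfromuri` check targets (hypothetical code, not from this commit): the first method contains the banned `new Path(new URI(...))` pattern, while the second makes the encoding assumption explicit through `SparkPath`:

```scala
import java.net.URI
import org.apache.hadoop.fs.Path
import org.apache.spark.paths.SparkPath

// Flagged by the regex check: is `str` url-encoded or not? The reader cannot tell.
def flagged(str: String): Path = new Path(new URI(str))

// Preferred: the caller explicitly asserts that `str` is url-encoded.
def preferred(str: String): Path = SparkPath.fromUrlString(str).toPath
```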

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 4 additions & 3 deletions
@@ -34,6 +34,7 @@ import org.apache.spark.api.java.function._
 import org.apache.spark.api.python.{PythonRDD, SerDeUtil}
 import org.apache.spark.api.r.RRDD
 import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.paths.SparkPath
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, QueryPlanningTracker, ScalaReflection, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis._
@@ -3924,18 +3925,18 @@
   * @since 2.0.0
   */
  def inputFiles: Array[String] = {
-    val files: Seq[String] = queryExecution.optimizedPlan.collect {
+    val files: Seq[SparkPath] = queryExecution.optimizedPlan.collect {
      case LogicalRelation(fsBasedRelation: FileRelation, _, _, _) =>
        fsBasedRelation.inputFiles
      case fr: FileRelation =>
        fr.inputFiles
      case r: HiveTableRelation =>
-        r.tableMeta.storage.locationUri.map(_.toString).toArray
+        r.tableMeta.storage.locationUri.map(SparkPath.fromUri).toArray
      case DataSourceV2ScanRelation(DataSourceV2Relation(table: FileTable, _, _, _, _),
        _, _, _, _) =>
        table.fileIndex.inputFiles
    }.flatten
-    files.toSet.toArray
+    files.iterator.map(_.urlEncoded).toSet.toArray
  }
 
  /**
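A sketch of the observable effect on `Dataset.inputFiles` (hypothetical session and paths): the method still returns `Array[String]`, but each entry now comes from `SparkPath.urlEncoded`, so characters such as spaces appear url-encoded:

```scala
// Assuming a SparkSession named `spark` and a directory whose name contains a space.
val df = spark.read.parquet("/tmp/dir with space/data")

// Expected entries look like: file:/tmp/dir%20with%20space/data/part-00000-....parquet
df.inputFiles.foreach(println)
```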
