From f6eb4ada04e21760f805b3e50062f91a438cef56 Mon Sep 17 00:00:00 2001
From: sujith71955 <sujithchacko.2010@gmail.com>
Date: Fri, 24 Nov 2017 19:01:22 +0530
Subject: [PATCH] [SPARK-22601][SQL] Data load is getting displayed successful
 on providing non existing hdfs file path ## What changes were proposed in
 this pull request? When a user tries to load data from a non-existent HDFS
 file path, the system does not validate the path and the load command
 reports success. This is misleading to the user. Validation already exists
 for the local file path scenario. This PR adds the same validation for the
 HDFS file path scenario. ## How was this patch tested? Existing tests verify
 the impact, and the scenario has been verified manually on an HDFS cluster.

---
 .../org/apache/spark/sql/execution/command/tables.scala  | 9 ++++++++-
 .../apache/spark/sql/hive/execution/HiveDDLSuite.scala   | 9 +++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index c9f6e571ddab3..c42e6c3257fad 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -340,7 +340,7 @@ case class LoadDataCommand(
         uri
       } else {
         val uri = new URI(path)
-        if (uri.getScheme() != null && uri.getAuthority() != null) {
+        val hdfsUri = if (uri.getScheme() != null && uri.getAuthority() != null) {
           uri
         } else {
           // Follow Hive's behavior:
@@ -380,6 +380,13 @@ case class LoadDataCommand(
           }
           new URI(scheme, authority, absolutePath, uri.getQuery(), uri.getFragment())
         }
+        val hadoopConf = sparkSession.sessionState.newHadoopConf()
+        val srcPath = new Path(hdfsUri)
+        val fs = srcPath.getFileSystem(hadoopConf)
+        if (!fs.exists(srcPath)) {
+          throw new AnalysisException(s"LOAD DATA input path does not exist: $path")
+        }
+        hdfsUri
       }
 
     if (partition.nonEmpty) {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 9063ef066aa84..6c11905ba8904 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -2141,4 +2141,13 @@ class HiveDDLSuite
       }
     }
   }
+
+  test("load command for non local invalid path validation") {
+    withTable("tbl") {
+      sql("CREATE TABLE tbl(i INT, j STRING)")
+      val e = intercept[AnalysisException](
+        sql("load data inpath '/doesnotexist.csv' into table tbl"))
+      assert(e.message.contains("LOAD DATA input path does not exist"))
+    }
+  }
 }