
Commit 6f593be

Hisoka-X authored and yaooqinn committed
[SPARK-43267][JDBC] Handle postgres unknown user-defined column as string in array
### What changes were proposed in this pull request?

Spark SQL currently does not support creating a DataFrame from a Postgres table that contains a user-defined array column. This PR adds support by reading such columns as strings.

### Why are the changes needed?

To support handling user-defined array columns in Spark SQL with Postgres.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

1. Added a new test.
2. Tested locally.

```sql
CREATE DOMAIN not_null_text AS TEXT DEFAULT '';

create table films
(
    code         char(5 char) not null
        constraint firstkey primary key,
    title        varchar(40 char) not null,
    did          bigint not null,
    date_prod    date,
    kind         varchar(10 char),
    tz           timestamp with time zone,
    int_arr      integer[],
    column_name  not_null_text[],
    column_name2 not_null_text
);

INSERT INTO public.films (code, title, did, date_prod, kind, tz, int_arr, column_name, column_name2)
VALUES (e'2 ', 'fdas', 1, '2023-04-07 16:05:48', '2', null, null, null, null);
INSERT INTO public.films (code, title, did, date_prod, kind, tz, int_arr, column_name, column_name2)
VALUES (e'4 ', 'fdsa', 1, '2023-04-07 16:05:48', '4', null, null, null, null);
INSERT INTO public.films (code, title, did, date_prod, kind, tz, int_arr, column_name, column_name2)
VALUES ('1 ', 'dafsdf', 1, '2023-04-04 14:43:51', '1', '2023-04-25 18:53:17.467000 +00:00', '{1,2,3}', '{1,fds,fdsa}', 'fdasfasdf');
```

Test Case

```scala
test("jdbc array") {
  val connectionProperties = new Properties()
  connectionProperties.put("user", "system")
  connectionProperties.put("password", "system")
  spark.read.jdbc(
    url = "jdbc:postgresql://localhost:54321/test?useSSL=false&serverTimezone=UTC" +
      "&useUnicode=true&characterEncoding=utf-8",
    table = "TEST.public.films",
    connectionProperties
  ).show()
}
```

Result

<img width="1444" alt="image" src="https://user-images.githubusercontent.com/32387433/234458027-e67e410b-c417-400d-be7e-431768afc0ef.png">

Closes #40953 from Hisoka-X/SPARK-43267_pg_array.

Lead-authored-by: Jia Fan <[email protected]>
Co-authored-by: Hisoka <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
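For illustration, a minimal sketch (not part of this commit) of what the new behavior looks like from the reader's side, assuming a running `SparkSession` named `spark`; the URL, credentials, and table come from the manual test above:

```scala
import java.util.Properties

// Placeholder connection details, mirroring the manual test above.
val props = new Properties()
props.put("user", "system")
props.put("password", "system")

val df = spark.read.jdbc(
  "jdbc:postgresql://localhost:54321/test", "public.films", props)

// With this patch, a user-defined array column such as not_null_text[]
// is inferred as array<string> rather than failing the read:
df.printSchema()
// ...
// |-- column_name: array (nullable = true)
// |    |-- element: string (containsNull = true)
```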
1 parent f2f6272 commit 6f593be

File tree

2 files changed, +20 -2 lines changed


connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala

Lines changed: 16 additions & 0 deletions
```diff
@@ -147,6 +147,13 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
       |('2013-04-05 12:01:02'),
       |('2013-04-05 18:01:02.123'),
       |('2013-04-05 18:01:02.123456')""".stripMargin).executeUpdate()
+
+    conn.prepareStatement("CREATE DOMAIN not_null_text AS TEXT DEFAULT ''").executeUpdate()
+    conn.prepareStatement("create table custom_type(type_array not_null_text[]," +
+      "type not_null_text)").executeUpdate()
+    conn.prepareStatement("INSERT INTO custom_type (type_array, type) VALUES" +
+      "('{1,fds,fdsa}','fdasfasdf')").executeUpdate()
+
   }
 
   test("Type mapping for various types") {
@@ -416,4 +423,13 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
     val df_actual = sqlContext.read.jdbc(jdbcUrl, "timestamp_ntz_roundtrip", prop)
     assert(df_actual.collect()(0) == df_expected.collect()(0))
   }
+
+  test("SPARK-43267: user-defined column in array test") {
+    val df = sqlContext.read.jdbc(jdbcUrl, "custom_type", new Properties)
+    val row = df.collect()
+    assert(row.length === 1)
+    assert(row(0).length === 2)
+    assert(row(0).getSeq[String](0) == Seq("1", "fds", "fdsa"))
+    assert(row(0).getString(1) == "fdasfasdf")
+  }
 }
```
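As a usage note, a hedged sketch of how a row produced by this test can be consumed downstream; `getSeq` and `getString` are standard `Row` accessors, and `render` is a hypothetical helper:

```scala
import org.apache.spark.sql.Row

// Each not_null_text[] value arrives in the Row as a Seq[String].
def render(row: Row): String = {
  val arr  = row.getSeq[String](0) // type_array column, e.g. Seq("1", "fds", "fdsa")
  val text = row.getString(1)      // type column, e.g. "fdasfasdf"
  s"type_array=[${arr.mkString(", ")}], type=$text"
}
```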

sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala

Lines changed: 4 additions & 2 deletions
```diff
@@ -94,13 +94,15 @@ private object PostgresDialect extends JdbcDialect with SQLConfHelper {
     case "numeric" | "decimal" if precision > 0 => Some(DecimalType.bounded(precision, scale))
     case "numeric" | "decimal" =>
       // SPARK-26538: handle numeric without explicit precision and scale.
-      Some(DecimalType. SYSTEM_DEFAULT)
+      Some(DecimalType.SYSTEM_DEFAULT)
     case "money" =>
       // money[] type seems to be broken and difficult to handle.
       // So this method returns None for now.
       // See SPARK-34333 and https://github.com/pgjdbc/pgjdbc/issues/1405
       None
-    case _ => None
+    case _ =>
+      // SPARK-43267: handle unknown types in array as string, because there are user-defined types
+      Some(StringType)
   }
 
   override def convertJavaTimestampToTimestampNTZ(t: Timestamp): LocalDateTime = {
```
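For context, the `case _` above sits in the dialect's array-element mapping: Postgres reports array type names with a leading underscore (e.g. `_not_null_text`), the dialect strips it and maps the element name to a Catalyst type. A condensed, illustrative sketch of that shape (the real method handles many more type names than shown here):

```scala
import org.apache.spark.sql.types._

// Condensed sketch of PostgresDialect's element-type mapping; illustrative only.
def toCatalystType(typeName: String, precision: Int, scale: Int): Option[DataType] =
  typeName match {
    case "bool"             => Some(BooleanType)
    case "int4"             => Some(IntegerType)
    case "int8" | "oid"     => Some(LongType)
    case "text" | "varchar" => Some(StringType)
    case "numeric" | "decimal" if precision > 0 =>
      Some(DecimalType.bounded(precision, scale))
    case "numeric" | "decimal" => Some(DecimalType.SYSTEM_DEFAULT)
    case "money"               => None // money[] is broken in pgjdbc; see SPARK-34333
    case _ =>
      // SPARK-43267: unknown (e.g. user-defined) element types decode as string.
      Some(StringType)
  }

// Array columns are decoded by dropping the leading '_' from the reported
// type name and wrapping the element type, roughly:
//   toCatalystType(typeName.drop(1), size, scale).map(ArrayType(_))
```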
