diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md
index 3b83bf5bc147..dec7bc36116b 100644
--- a/docs/sql-data-sources-jdbc.md
+++ b/docs/sql-data-sources-jdbc.md
@@ -103,7 +103,7 @@ logging into the data sources.
     (none)
       A prefix that will form the final query together with query.
-      As the specified query will be parenthesized as a subquery in the FROM clause and some databases do not
+      As the specified query will be parenthesized as a subquery in the FROM clause and some databases do not
       support all clauses in subqueries, the prepareQuery property offers a way to run such complex queries.
       As an example, spark will issue a query of the following form to the JDBC Source.
 
       <prepareQuery> SELECT <columns> FROM (<user_specified_query>) spark_gen_alias
 
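For context, the two options above combine on the read path roughly as in the sketch below; the connection URL and the CTE are illustrative placeholders, not taken from this patch.

```scala
// Hypothetical prepareQuery + query usage (URL and SQL are placeholders).
// Spark sends: <prepareQuery> SELECT <columns> FROM (<query>) spark_gen_alias
val df = spark.read.format("jdbc")
  .option("url", "jdbc:sqlserver://host;databaseName=db")
  .option("prepareQuery", "WITH t AS (SELECT 1 AS id, 'a' AS name)")
  .option("query", "SELECT id, name FROM t")
  .load()
```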
@@ -340,10 +340,19 @@ logging into the data sources.
       The name of the JDBC connection provider to use to connect to this URL, e.g. db2, mssql.
       Must be one of the providers loaded with the JDBC data source. Used to disambiguate when more than one provider can handle
-      the specified driver and options. The selected provider must not be disabled by spark.sql.sources.disabledJdbcConnProviderList.
+      the specified driver and options. The selected provider must not be disabled by spark.sql.sources.disabledJdbcConnProviderList.
     </td>
     <td>read/write</td>
-  </tr>
+  </tr>
+  <tr>
+    <td><code>inferTimestampNTZType</code></td>
+    <td>false</td>
+    <td>
+      When the option is set to true, all timestamps are inferred as TIMESTAMP WITHOUT TIME ZONE.
+      Otherwise, timestamps are read as TIMESTAMP with local time zone.
+    </td>
+    <td>read</td>
+  </tr>
 </table>
 
 Note that kerberos authentication with keytab is not always supported by the JDBC driver.
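A minimal read-side sketch of the new option as documented above; the URL and table name are placeholders.

```scala
// Placeholder connection details; only inferTimestampNTZType is the point here.
val events = spark.read.format("jdbc")
  .option("url", "jdbc:postgresql://localhost:5432/test")
  .option("dbtable", "events")
  // true: TIMESTAMP columns are inferred as TimestampNTZType (no time zone).
  // false or unset: they are read as TimestampType with local time zone semantics.
  .option("inferTimestampNTZType", "true")
  .load()
```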
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
index df21a9820f9b..80675c7dc471 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
@@ -226,6 +226,9 @@ class JDBCOptions(
   // The prefix that is added to the query sent to the JDBC database.
   // This is required to support some complex queries with some JDBC databases.
   val prepareQuery = parameters.get(JDBC_PREPARE_QUERY).map(_ + " ").getOrElse("")
+
+  // Infers timestamp values as TimestampNTZ type when reading data.
+  val inferTimestampNTZType = parameters.getOrElse(JDBC_INFER_TIMESTAMP_NTZ, "false").toBoolean
 }
 
 class JdbcOptionsInWrite(
@@ -287,4 +290,5 @@ object JDBCOptions {
   val JDBC_REFRESH_KRB5_CONFIG = newOption("refreshKrb5Config")
   val JDBC_CONNECTION_PROVIDER = newOption("connectionProvider")
   val JDBC_PREPARE_QUERY = newOption("prepareQuery")
+  val JDBC_INFER_TIMESTAMP_NTZ = newOption("inferTimestampNTZType")
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala
index 27d2d9c84c34..e95fe280c760 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala
@@ -67,7 +67,8 @@ object JDBCRDD extends Logging {
     statement.setQueryTimeout(options.queryTimeout)
     val rs = statement.executeQuery()
     try {
-      JdbcUtils.getSchema(rs, dialect, alwaysNullable = true)
+      JdbcUtils.getSchema(rs, dialect, alwaysNullable = true,
+        isTimestampNTZ = options.inferTimestampNTZType)
     } finally {
       rs.close()
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
index 1f17d4f0b14c..cc8746ea5c40 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
@@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils, GenericArrayData}
-import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp}
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateTimeToMicros, localDateToDays, toJavaDate, toJavaTimestamp}
 import org.apache.spark.sql.connector.catalog.TableChange
 import org.apache.spark.sql.connector.catalog.index.{SupportsIndex, TableIndex}
 import org.apache.spark.sql.connector.expressions.NamedReference
@@ -150,6 +150,10 @@ object JdbcUtils extends Logging with SQLConfHelper {
     case StringType => Option(JdbcType("TEXT", java.sql.Types.CLOB))
     case BinaryType => Option(JdbcType("BLOB", java.sql.Types.BLOB))
     case TimestampType => Option(JdbcType("TIMESTAMP", java.sql.Types.TIMESTAMP))
+    // This is a common case of timestamp without time zone. Most of the databases either only
+    // support TIMESTAMP type or use TIMESTAMP as an alias for TIMESTAMP WITHOUT TIME ZONE.
+    // Note that some dialects override this setting, e.g. SQL Server.
+    case TimestampNTZType => Option(JdbcType("TIMESTAMP", java.sql.Types.TIMESTAMP))
     case DateType => Option(JdbcType("DATE", java.sql.Types.DATE))
     case t: DecimalType => Option(
       JdbcType(s"DECIMAL(${t.precision},${t.scale})", java.sql.Types.DECIMAL))
@@ -173,7 +177,8 @@ object JdbcUtils extends Logging with SQLConfHelper {
       sqlType: Int,
       precision: Int,
       scale: Int,
-      signed: Boolean): DataType = {
+      signed: Boolean,
+      isTimestampNTZ: Boolean): DataType = {
     val answer = sqlType match {
       // scalastyle:off
       case java.sql.Types.ARRAY => null
@@ -215,6 +220,8 @@ object JdbcUtils extends Logging with SQLConfHelper {
       case java.sql.Types.TIME => TimestampType
       case java.sql.Types.TIME_WITH_TIMEZONE => null
+      case java.sql.Types.TIMESTAMP
+        if isTimestampNTZ => TimestampNTZType
       case java.sql.Types.TIMESTAMP => TimestampType
       case java.sql.Types.TIMESTAMP_WITH_TIMEZONE => null
@@ -243,7 +250,8 @@ object JdbcUtils extends Logging with SQLConfHelper {
       conn.prepareStatement(options.prepareQuery + dialect.getSchemaQuery(options.tableOrQuery))
     try {
       statement.setQueryTimeout(options.queryTimeout)
-      Some(getSchema(statement.executeQuery(), dialect))
+      Some(getSchema(statement.executeQuery(), dialect,
+        isTimestampNTZ = options.inferTimestampNTZType))
     } catch {
       case _: SQLException => None
     } finally {
@@ -258,13 +266,15 @@ object JdbcUtils extends Logging with SQLConfHelper {
    * Takes a [[ResultSet]] and returns its Catalyst schema.
    *
    * @param alwaysNullable If true, all the columns are nullable.
+   * @param isTimestampNTZ If true, all timestamp columns are interpreted as TIMESTAMP_NTZ.
    * @return A [[StructType]] giving the Catalyst schema.
    * @throws SQLException if the schema contains an unsupported type.
    */
   def getSchema(
       resultSet: ResultSet,
       dialect: JdbcDialect,
-      alwaysNullable: Boolean = false): StructType = {
+      alwaysNullable: Boolean = false,
+      isTimestampNTZ: Boolean = false): StructType = {
     val rsmd = resultSet.getMetaData
     val ncols = rsmd.getColumnCount
     val fields = new Array[StructField](ncols)
@@ -306,7 +316,7 @@ object JdbcUtils extends Logging with SQLConfHelper {
       val columnType =
         dialect.getCatalystType(dataType, typeName, fieldSize, metadata).getOrElse(
-          getCatalystType(dataType, fieldSize, fieldScale, isSigned))
+          getCatalystType(dataType, fieldSize, fieldScale, isSigned, isTimestampNTZ))
       fields(i) = StructField(columnName, columnType, nullable, metadata.build())
       i = i + 1
     }
@@ -463,7 +473,7 @@ object JdbcUtils extends Logging with SQLConfHelper {
         }
       }
 
-    case TimestampType =>
+    case TimestampType | TimestampNTZType =>
       (rs: ResultSet, row: InternalRow, pos: Int) =>
         val t = rs.getTimestamp(pos + 1)
         if (t != null) {
@@ -583,6 +593,18 @@ object JdbcUtils extends Logging with SQLConfHelper {
           stmt.setTimestamp(pos + 1, row.getAs[java.sql.Timestamp](pos))
       }
 
+    case TimestampNTZType =>
+      if (conf.datetimeJava8ApiEnabled) {
+        (stmt: PreparedStatement, row: Row, pos: Int) =>
+          stmt.setTimestamp(pos + 1, toJavaTimestamp(instantToMicros(row.getAs[Instant](pos))))
+      } else {
+        (stmt: PreparedStatement, row: Row, pos: Int) =>
+          stmt.setTimestamp(
+            pos + 1,
+            toJavaTimestamp(localDateTimeToMicros(row.getAs[java.time.LocalDateTime](pos)))
+          )
+      }
+
     case DateType =>
       if (conf.datetimeJava8ApiEnabled) {
         (stmt: PreparedStatement, row: Row, pos: Int) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala
index a42129dbe8da..c95489a28761 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala
@@ -98,6 +98,7 @@ private object MsSqlServerDialect extends JdbcDialect {
   override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
     case TimestampType => Some(JdbcType("DATETIME", java.sql.Types.TIMESTAMP))
+    case TimestampNTZType => Some(JdbcType("DATETIME", java.sql.Types.TIMESTAMP))
     case StringType => Some(JdbcType("NVARCHAR(MAX)", java.sql.Types.NVARCHAR))
     case BooleanType => Some(JdbcType("BIT", java.sql.Types.BIT))
     case BinaryType => Some(JdbcType("VARBINARY(MAX)", java.sql.Types.VARBINARY))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index c96b27ee7f3f..a07ef5ecd300 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.jdbc
 
 import java.math.BigDecimal
 import java.sql.{Date, DriverManager, SQLException, Timestamp}
-import java.time.{Instant, LocalDate}
+import java.time.{Instant, LocalDate, LocalDateTime}
 import java.util.{Calendar, GregorianCalendar, Properties, TimeZone}
 
 import scala.collection.JavaConverters._
@@ -1230,6 +1230,7 @@ class JDBCSuite extends QueryTest
     assert(getJdbcType(oracleDialect, BinaryType) == "BLOB")
     assert(getJdbcType(oracleDialect, DateType) == "DATE")
     assert(getJdbcType(oracleDialect, TimestampType) == "TIMESTAMP")
+    assert(getJdbcType(oracleDialect, TimestampNTZType) == "TIMESTAMP")
   }
 
   private def assertEmptyQuery(sqlString: String): Unit = {
@@ -1879,5 +1880,53 @@ class JDBCSuite extends QueryTest
     val fields = schema.fields
     assert(fields.length === 1)
     assert(fields(0).dataType === StringType)
-  }
+  }
+
+  test("SPARK-39339: Handle TimestampNTZType null values") {
+    val tableName = "timestamp_ntz_null_table"
+
+    val df = Seq(null.asInstanceOf[LocalDateTime]).toDF("col1")
+
+    df.write.format("jdbc")
+      .option("url", urlWithUserAndPass)
+      .option("dbtable", tableName).save()
+
+    val res = spark.read.format("jdbc")
+      .option("inferTimestampNTZType", "true")
+      .option("url", urlWithUserAndPass)
+      .option("dbtable", tableName)
+      .load()
+
+    checkAnswer(res, Seq(Row(null)))
+  }
+
+  test("SPARK-39339: TimestampNTZType with different local time zones") {
+    val tableName = "timestamp_ntz_diff_tz_support_table"
+
+    DateTimeTestUtils.outstandingZoneIds.foreach { zoneId =>
+      DateTimeTestUtils.withDefaultTimeZone(zoneId) {
+        Seq(
+          "1972-07-04 03:30:00",
+          "2019-01-20 12:00:00.502",
+          "2019-01-20T00:00:00.123456",
+          "1500-01-20T00:00:00.123456"
+        ).foreach { case datetime =>
+          val df = spark.sql(s"select timestamp_ntz '$datetime'")
+          df.write.format("jdbc")
+            .mode("overwrite")
+            .option("url", urlWithUserAndPass)
+            .option("dbtable", tableName)
+            .save()
+
+          val res = spark.read.format("jdbc")
+            .option("inferTimestampNTZType", "true")
+            .option("url", urlWithUserAndPass)
+            .option("dbtable", tableName)
+            .load()
+
+          checkAnswer(res, df)
+        }
+      }
+    }
+  }
 }
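Outside the suite, the behaviour these tests pin down is a simple round trip; the sketch below assumes an active SparkSession named spark and uses an H2 URL as a stand-in for the suite's urlWithUserAndPass.

```scala
import java.time.LocalDateTime
import spark.implicits._  // assumed SparkSession named `spark`

val url = "jdbc:h2:mem:testdb0;user=testUser;password=testPass"  // placeholder connection

// Write path: TimestampNTZType goes out through the dialect's TIMESTAMP/DATETIME mapping.
val in = Seq(LocalDateTime.parse("2019-01-20T12:00:00.502")).toDF("ts")
in.write.format("jdbc")
  .mode("overwrite")
  .option("url", url)
  .option("dbtable", "timestamp_ntz_roundtrip")
  .save()

// Read path: the column is inferred as TimestampNTZType only when the option is enabled.
val out = spark.read.format("jdbc")
  .option("url", url)
  .option("dbtable", "timestamp_ntz_roundtrip")
  .option("inferTimestampNTZType", "true")
  .load()

out.printSchema()  // ts: timestamp_ntz
```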