diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 322f0ce65c5..ac2a3db645c 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -173,6 +173,16 @@ class OrcStripeReader : public RecordBatchReader { int64_t batch_size_; }; +liborc::RowReaderOptions default_row_reader_options() { + liborc::RowReaderOptions options; + // Orc timestamp type is error-prone since it serializes values in the writer timezone + // and reads them back in the reader timezone. To avoid this, both the Apache Orc C++ + // writer and reader set the timezone to GMT by default to avoid any conversion. + // We follow the same practice here explicitly to make sure readers are aware of this. + options.setTimezoneName("GMT"); + return options; +} + } // namespace class ORCFileReader::Impl { @@ -320,25 +330,25 @@ class ORCFileReader::Impl { } Result> Read() { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema()); return ReadTable(opts, schema); } Result> Read(const std::shared_ptr& schema) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); return ReadTable(opts, schema); } Result> Read(const std::vector& include_indices) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); } Result> Read(const std::vector& include_names) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); RETURN_NOT_OK(SelectNames(&opts, include_names)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); @@ -346,13 +356,13 @@ class ORCFileReader::Impl { Result> Read(const std::shared_ptr& schema, const std::vector& include_indices) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); return ReadTable(opts, schema); } Result> ReadStripe(int64_t stripe) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadBatch(opts, schema, stripes_[stripe].num_rows); @@ -360,7 +370,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_indices) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -369,7 +379,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_names) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); RETURN_NOT_OK(SelectNames(&opts, include_names)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -475,7 +485,7 @@ class ORCFileReader::Impl { return nullptr; } - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); if (!include_indices.empty()) { RETURN_NOT_OK(SelectIndices(&opts, include_indices)); } @@ -496,7 +506,7 @@ class ORCFileReader::Impl { Result> GetRecordBatchReader( int64_t batch_size, const std::vector& include_names) { - liborc::RowReaderOptions opts; + liborc::RowReaderOptions opts = default_row_reader_options(); if (!include_names.empty()) { RETURN_NOT_OK(SelectNames(&opts, include_names)); } @@ -696,6 +706,11 @@ Result MakeOrcWriterOptions( }); orc_options.setColumnsUseBloomFilter(std::move(orc_bloom_filter_columns)); orc_options.setBloomFilterFPP(options.bloom_filter_fpp); + // Orc timestamp type is error-prone since it serializes values in the writer timezone + // and reads them back in the reader timezone. To avoid this, both the Apache Orc C++ + // writer and reader set the timezone to GMT by default to avoid any conversion. + // We follow the same practice here explicitly to make sure readers are aware of this. + orc_options.setTimezoneName("GMT"); switch (options.compression) { case Compression::UNCOMPRESSED: orc_options.setCompression(liborc::CompressionKind::CompressionKind_NONE); diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 4e79038308a..ff194e945d0 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -124,30 +124,25 @@ Result> GenerateRandomDate64Array(int64_t size, date64(), rand.Int64(size, kMilliMin, kMilliMax, null_probability)); } -Result> GenerateRandomTimestampArray(int64_t size, - TimeUnit::type type, - double null_probability) { +Result> GenerateRandomTimestampArray( + int64_t size, std::shared_ptr type, double null_probability) { random::RandomArrayGenerator rand(kRandomSeed); - switch (type) { + switch (type->unit()) { case TimeUnit::type::SECOND: { return CastInt64ArrayToTemporalArray( - timestamp(TimeUnit::SECOND), - rand.Int64(size, kSecondMin, kSecondMax, null_probability)); + type, rand.Int64(size, kSecondMin, kSecondMax, null_probability)); } case TimeUnit::type::MILLI: { return CastInt64ArrayToTemporalArray( - timestamp(TimeUnit::MILLI), - rand.Int64(size, kMilliMin, kMilliMax, null_probability)); + type, rand.Int64(size, kMilliMin, kMilliMax, null_probability)); } case TimeUnit::type::MICRO: { return CastInt64ArrayToTemporalArray( - timestamp(TimeUnit::MICRO), - rand.Int64(size, kMicroMin, kMicroMax, null_probability)); + type, rand.Int64(size, kMicroMin, kMicroMax, null_probability)); } case TimeUnit::type::NANO: { return CastInt64ArrayToTemporalArray( - timestamp(TimeUnit::NANO), - rand.Int64(size, kNanoMin, kNanoMax, null_probability)); + type, rand.Int64(size, kNanoMin, kNanoMax, null_probability)); } default: { return Status::TypeError("Unknown or unsupported Arrow TimeUnit: ", type); @@ -196,12 +191,11 @@ std::shared_ptr GenerateRandomChunkedArray( break; } case Type::TIMESTAMP: { - EXPECT_OK_AND_ASSIGN( - arrays[j], - GenerateRandomTimestampArray( - current_size_chunks[j], - internal::checked_pointer_cast(data_type)->unit(), - null_probability)); + EXPECT_OK_AND_ASSIGN(arrays[j], + GenerateRandomTimestampArray( + current_size_chunks[j], + internal::checked_pointer_cast(data_type), + null_probability)); break; } default: @@ -486,7 +480,8 @@ class TestORCWriterTrivialNoConversion : public ::testing::Test { field("int32", int32()), field("int64", int64()), field("float", float32()), field("double", float64()), field("decimal128nz", decimal128(25, 6)), field("decimal128z", decimal128(32, 0)), field("date32", date32()), - field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), + field("ts3", timestamp(TimeUnit::NANO)), + field("ts4", timestamp(TimeUnit::NANO, "UTC")), field("string", utf8()), field("binary", binary()), field("struct", struct_({field("a", utf8()), field("b", int64())})), field("list", list(int32())), @@ -526,6 +521,7 @@ TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) { field("double", float64()), field("date32", date32()), field("ts3", timestamp(TimeUnit::NANO)), + field("ts4", timestamp(TimeUnit::NANO), "UTC"), field("string", utf8()), field("binary", binary()), }); diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index d58ccab0b77..6e9f9a288b6 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -381,6 +381,7 @@ Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, return AppendNumericBatchCast(batch, offset, length, builder); case liborc::TIMESTAMP: + case liborc::TIMESTAMP_INSTANT: return AppendTimestampBatch(batch, offset, length, builder); case liborc::DECIMAL: return AppendDecimalBatch(type, batch, offset, length, builder); @@ -995,8 +996,17 @@ Result> GetOrcType(const DataType& type) { case Type::type::DATE32: return liborc::createPrimitiveType(liborc::TypeKind::DATE); case Type::type::DATE64: - case Type::type::TIMESTAMP: return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP); + case Type::type::TIMESTAMP: { + const auto& timestamp_type = checked_cast(type); + if (!timestamp_type.timezone().empty()) { + // The timestamp values stored in the arrow array are normalized to UTC. + // TIMESTAMP_INSTANT type is always preferred over TIMESTAMP type. + return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP_INSTANT); + } + // The timestamp values stored in the arrow array can be in any timezone. + return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP); + } case Type::type::DECIMAL128: { const uint64_t precision = static_cast(checked_cast(type).precision()); @@ -1111,7 +1121,20 @@ Result> GetArrowType(const liborc::Type* type) { case liborc::CHAR: return fixed_size_binary(static_cast(type->getMaximumLength())); case liborc::TIMESTAMP: + // Values of TIMESTAMP type are stored in the writer timezone in the Orc file. + // Values are read back in the reader timezone. However, the writer timezone + // information in the Orc stripe footer is optional and may be missing. What is + // more, stripes in the same Orc file may have different writer timezones (though + // unlikely). So we cannot tell the exact timezone of values read back in the + // arrow::TimestampArray. In the adapter implementations, we set both writer and + // reader timezone to UTC to avoid any conversion so users can get the same values + // as written. To get rid of this burden, TIMESTAMP_INSTANT type is always preferred + // over TIMESTAMP type. return timestamp(TimeUnit::NANO); + case liborc::TIMESTAMP_INSTANT: + // Values of TIMESTAMP_INSTANT type are stored in the UTC timezone in the ORC file. + // Both read and write use the UTC timezone without any conversion. + return timestamp(TimeUnit::NANO, "UTC"); case liborc::DATE: return date32(); case liborc::DECIMAL: {