diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 3bcdd70581..67761c0330 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -634,11 +634,56 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
     return struct_value[name].name(name)
 
 
+def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
+    if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
+        x, ibis_types.FloatingValue
+    ):
+        raise TypeError("Non-numerical types are not supposed to reach this function.")
+
+    if unit not in UNIT_TO_US_CONVERSION_FACTORS:
+        raise ValueError(f"Cannot convert input with unit '{unit}'.")
+    x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
+    x_converted = x_converted.cast(ibis_dtypes.int64)
+
+    # Note: Due to an issue where casting directly to a timestamp
+    # without a timezone does not work, we first cast to UTC. This
+    # approach appears to bypass a potential bug in Ibis's cast function,
+    # allowing for subsequent casting to a timestamp type without timezone
+    # information. Further investigation is needed to confirm this behavior.
+    return x_converted.to_timestamp(unit="us").cast(
+        ibis_dtypes.Timestamp(timezone="UTC")
+    )
+
+
 @scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True)
 def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
     to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type)
     if isinstance(x, ibis_types.NullScalar):
         return ibis_types.null().cast(to_type)
+
+    # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first.
+    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp:
+        x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC"))
+        return bigframes.dtypes.cast_ibis_value(x_converted, to_type)
+
+    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time:
+        # The conversion unit is set to "us" (microseconds) for consistency
+        # with pandas converting time64[us][pyarrow] to int64[pyarrow].
+        return x.delta(ibis.time("00:00:00"), part="microsecond")
+
+    if x.type() == ibis_dtypes.int64:
+        # The conversion unit is set to "us" (microseconds) for consistency
+        # with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
+        # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
+        unit = "us"
+        x_converted = numeric_to_datetime(x, unit)
+        if to_type == ibis_dtypes.timestamp:
+            return x_converted.cast(ibis_dtypes.Timestamp())
+        elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
+            return x_converted
+        elif to_type == ibis_dtypes.time:
+            return x_converted.time()
+
     return bigframes.dtypes.cast_ibis_value(x, to_type)
 
 
@@ -677,19 +722,7 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
         # The default unit is set to "ns" (nanoseconds) for consistency
         # with pandas, where "ns" is the default unit for datetime operations.
         unit = op.unit or "ns"
-        if unit not in UNIT_TO_US_CONVERSION_FACTORS:
-            raise ValueError(f"Cannot convert input with unit '{unit}'.")
-        x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
-        x_converted = x_converted.cast(ibis_dtypes.int64)
-
-        # Note: Due to an issue where casting directly to a timestamp
-        # without a timezone does not work, we first cast to UTC. This
-        # approach appears to bypass a potential bug in Ibis's cast function,
-        # allowing for subsequent casting to a timestamp type without timezone
-        # information. Further investigation is needed to confirm this behavior.
-        x = x_converted.to_timestamp(unit="us").cast(
-            ibis_dtypes.Timestamp(timezone="UTC")
-        )
+        x = numeric_to_datetime(x, unit)
 
     return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None))
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 366820f9f6..d78a88dfeb 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -60,6 +60,7 @@
     "boolean",
     "Float64",
     "Int64",
+    "int64[pyarrow]",
     "string",
     "string[pyarrow]",
     "timestamp[us, tz=UTC][pyarrow]",
@@ -173,6 +174,9 @@
 # "string" and "string[pyarrow]" are accepted
 BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")
 
+# special case - both "Int64" and "int64[pyarrow]" are accepted
+BIGFRAMES_STRING_TO_BIGFRAMES["int64[pyarrow]"] = pd.Int64Dtype()
+
 # For the purposes of dataframe.memory_usage
 # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
 DTYPE_BYTE_SIZES = {
@@ -310,11 +314,12 @@ def bigframes_dtype_to_ibis_dtype(
         textwrap.dedent(
             f"""
             Unexpected data type {bigframes_dtype}. The following
-            str dtypes are supppted: 'boolean','Float64','Int64', 'string',
-            'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]',
-            'timestamp[us][pyarrow]','date32[day][pyarrow]',
-            'time64[us][pyarrow]'. The following pandas.ExtensionDtype are
-            supported: pandas.BooleanDtype(), pandas.Float64Dtype(),
+            str dtypes are supported: 'boolean','Float64','Int64',
+            'int64[pyarrow]','string','string[pyarrow]',
+            'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
+            'date32[day][pyarrow]','time64[us][pyarrow]'.
+            The following pandas.ExtensionDtype are supported:
+            pandas.BooleanDtype(), pandas.Float64Dtype(),
             pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
             pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
             pd.ArrowDtype(pa.timestamp("us")),
@@ -434,6 +439,9 @@ def cast_ibis_value(
             ibis_dtypes.string,
             ibis_dtypes.Decimal(precision=38, scale=9),
             ibis_dtypes.Decimal(precision=76, scale=38),
+            ibis_dtypes.time,
+            ibis_dtypes.timestamp,
+            ibis_dtypes.Timestamp(timezone="UTC"),
         ),
         ibis_dtypes.float64: (
             ibis_dtypes.string,
@@ -447,8 +455,15 @@ def cast_ibis_value(
             ibis_dtypes.Decimal(precision=38, scale=9),
             ibis_dtypes.Decimal(precision=76, scale=38),
             ibis_dtypes.binary,
+            ibis_dtypes.date,
+            ibis_dtypes.timestamp,
+            ibis_dtypes.Timestamp(timezone="UTC"),
+        ),
+        ibis_dtypes.date: (
+            ibis_dtypes.string,
+            ibis_dtypes.timestamp,
+            ibis_dtypes.Timestamp(timezone="UTC"),
         ),
-        ibis_dtypes.date: (ibis_dtypes.string,),
         ibis_dtypes.Decimal(precision=38, scale=9): (
             ibis_dtypes.float64,
             ibis_dtypes.Decimal(precision=76, scale=38),
@@ -457,9 +472,24 @@ def cast_ibis_value(
             ibis_dtypes.float64,
             ibis_dtypes.Decimal(precision=38, scale=9),
         ),
-        ibis_dtypes.time: (),
-        ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
-        ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
+        ibis_dtypes.time: (
+            ibis_dtypes.int64,
+            ibis_dtypes.string,
+        ),
+        ibis_dtypes.timestamp: (
+            ibis_dtypes.date,
+            ibis_dtypes.int64,
+            ibis_dtypes.string,
+            ibis_dtypes.time,
+            ibis_dtypes.Timestamp(timezone="UTC"),
+        ),
+        ibis_dtypes.Timestamp(timezone="UTC"): (
+            ibis_dtypes.date,
+            ibis_dtypes.int64,
+            ibis_dtypes.string,
+            ibis_dtypes.time,
+            ibis_dtypes.timestamp,
+        ),
         ibis_dtypes.binary: (ibis_dtypes.string,),
     }
 
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index f5c5b1c216..e22037a1ce 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2625,6 +2625,9 @@ def foo(x):
         ("int64_col", "boolean"),
         ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))),
         ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))),
+        ("int64_col", pd.ArrowDtype(pa.timestamp("us"))),
+        ("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
+        ("int64_col", "time64[us][pyarrow]"),
         ("bool_col", "Int64"),
         ("bool_col", "string[pyarrow]"),
         ("string_col", "binary[pyarrow]"),
@@ -2633,9 +2636,17 @@ def foo(x):
         # raises a deprecation warning to use tz_localize/tz_convert instead,
         # but BigQuery always stores values as UTC and doesn't have to deal
         # with timezone conversions, so we'll allow it.
+        ("timestamp_col", "date32[day][pyarrow]"),
+        ("timestamp_col", "time64[us][pyarrow]"),
         ("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))),
+        ("datetime_col", "date32[day][pyarrow]"),
+        ("datetime_col", "string[pyarrow]"),
+        ("datetime_col", "time64[us][pyarrow]"),
         ("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
         ("date_col", "string[pyarrow]"),
+        ("date_col", pd.ArrowDtype(pa.timestamp("us"))),
+        ("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
+        ("time_col", "string[pyarrow]"),
         # TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int
         # ("float64_col", "Int64"),
         # TODO(bmil): decide whether to fix Ibis bug: BigQuery backend
@@ -2653,6 +2664,24 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type):
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize(
+    ("column", "to_type"),
+    [
+        ("timestamp_col", "int64[pyarrow]"),
+        ("datetime_col", "int64[pyarrow]"),
+        ("time_col", "int64[pyarrow]"),
+    ],
+)
+@skip_legacy_pandas
+def test_date_time_astype_int(
+    scalars_df_index, scalars_pandas_df_index, column, to_type
+):
+    bf_result = scalars_df_index[column].astype(to_type).to_pandas()
+    pd_result = scalars_pandas_df_index[column].astype(to_type)
+    pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+    assert bf_result.dtype == "Int64"
+
+
 def test_string_astype_int():
     pd_series = pd.Series(["4", "-7", "0", " -03"])
     bf_series = series.Series(pd_series)
@@ -2676,6 +2705,75 @@ def test_string_astype_float():
     pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
 
 
+def test_string_astype_date():
+    pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype(
+        pd.ArrowDtype(pa.string())
+    )
+
+    bf_series = series.Series(pd_series)
+
+    pd_result = pd_series.astype("date32[day][pyarrow]")
+    bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas()
+
+    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_string_astype_datetime():
+    pd_series = pd.Series(
+        ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"]
+    ).astype(pd.ArrowDtype(pa.string()))
+
+    bf_series = series.Series(pd_series)
+
+    pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us")))
+    bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas()
+
+    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_string_astype_timestamp():
+    pd_series = pd.Series(
+        [
+            "2014-08-15 08:15:12+00:00",
+            "2015-08-15 08:15:12.654754+05:00",
+            "2016-02-29 00:00:00+08:00",
+        ]
+    ).astype(pd.ArrowDtype(pa.string()))
+
+    bf_series = series.Series(pd_series)
+
+    pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC")))
+    bf_result = bf_series.astype(
+        pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
+    ).to_pandas()
+
+    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_timestamp_astype_string():
+    bf_series = series.Series(
+        [
+            "2014-08-15 08:15:12+00:00",
+            "2015-08-15 08:15:12.654754+05:00",
+            "2016-02-29 00:00:00+08:00",
+        ]
+    ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC")))
+
+    expected_result = pd.Series(
+        [
+            "2014-08-15 08:15:12+00",
+            "2015-08-15 03:15:12.654754+00",
+            "2016-02-28 16:00:00+00",
+        ]
+    )
+    bf_result = bf_series.astype(pa.string()).to_pandas()
+
+    pd.testing.assert_series_equal(
+        bf_result, expected_result, check_index_type=False, check_dtype=False
+    )
+    assert bf_result.dtype == "string[pyarrow]"
+
+
 @pytest.mark.parametrize(
     "index",
     [0, 5, -2],