From bc6f172e544a06423b868bb16f1d5d6b806bbe2b Mon Sep 17 00:00:00 2001
From: Lily Zhang
Date: Thu, 14 Mar 2024 18:15:46 +0000
Subject: [PATCH 1/4] feat: support datetime related casting in (Series|DataFrame|Index).astype
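A sketch of what this enables at the API surface (column names follow the
scalars test fixtures used in the tests below; outputs are elided, so the
calls are marked as skipped doctests):

    >>> import pandas as pd
    >>> import pyarrow as pa
    >>> # int64 -> timestamp casts, and datetime/time -> int64 casts,
    >>> # now go through .astype() directly:
    >>> scalars_df["int64_col"].astype(pd.ArrowDtype(pa.timestamp("us")))  # doctest: +SKIP
    >>> scalars_df["timestamp_col"].astype("int64[pyarrow]")  # doctest: +SKIP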
---
 bigframes/core/compile/scalar_op_compiler.py | 58 +++++++++---
 bigframes/dtypes.py                          | 48 ++++++++--
 tests/system/small/test_series.py            | 96 ++++++++++++++++++++
 3 files changed, 180 insertions(+), 22 deletions(-)

diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 3bcdd70581..6e5185dd5d 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -634,11 +634,55 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
     return struct_value[name].name(name)


+def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
+    if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
+        x, ibis_types.FloatingValue
+    ):
+        raise TypeError("Non-numerical types are not supposed to reach this function.")
+
+    if unit not in UNIT_TO_US_CONVERSION_FACTORS:
+        raise ValueError(f"Cannot convert input with unit '{unit}'.")
+    x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
+    x_converted = x_converted.cast(ibis_dtypes.int64)
+
+    # Note: Due to an issue where casting directly to a timestamp
+    # without a timezone does not work, we first cast to UTC. This
+    # approach appears to bypass a potential bug in Ibis's cast function,
+    # allowing for subsequent casting to a timestamp type without timezone
+    # information. Further investigation is needed to confirm this behavior.
+    return x_converted.to_timestamp(unit="us").cast(
+        ibis_dtypes.Timestamp(timezone="UTC")
+    )
+
+
 @scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True)
 def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
     to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type)
     if isinstance(x, ibis_types.NullScalar):
         return ibis_types.null().cast(to_type)
+
+    # When casting a DATETIME column into an INT column, we need to convert the column into TIMESTAMP first.
+    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp:
+        x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC"))
+        return bigframes.dtypes.cast_ibis_value(x_converted, to_type)
+
+    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time:
+        # The conversion unit is set to "us" (microseconds) for consistency
+        # with pandas converting time64[us][pyarrow] to int64[pyarrow].
+        return x.delta(ibis.time("00:00:00"), part="microsecond")
+
+    if x.type() == ibis_dtypes.int64:
+        # The conversion unit is set to "us" (microseconds) for consistency
+        # with pandas converting timestamp[us][pyarrow] to int64[pyarrow].
+        unit = "us"
+        x_converted = numeric_to_datetime(x, unit)
+        if to_type == ibis_dtypes.timestamp:
+            return x_converted.cast(ibis_dtypes.Timestamp())
+        elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
+            return x_converted
+        elif to_type == ibis_dtypes.time:
+            return x_converted.time()
+
     return bigframes.dtypes.cast_ibis_value(x, to_type)


@@ -677,19 +721,7 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
     # The default unit is set to "ns" (nanoseconds) for consistency
     # with pandas, where "ns" is the default unit for datetime operations.
     unit = op.unit or "ns"
-    if unit not in UNIT_TO_US_CONVERSION_FACTORS:
-        raise ValueError(f"Cannot convert input with unit '{unit}'.")
-    x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
-    x_converted = x_converted.cast(ibis_dtypes.int64)
-
-    # Note: Due to an issue where casting directly to a timestamp
-    # without a timezone does not work, we first cast to UTC. This
-    # approach appears to bypass a potential bug in Ibis's cast function,
-    # allowing for subsequent casting to a timestamp type without timezone
-    # information. Further investigation is needed to confirm this behavior.
-    x = x_converted.to_timestamp(unit="us").cast(
-        ibis_dtypes.Timestamp(timezone="UTC")
-    )
+    x = numeric_to_datetime(x, unit)

     return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None))
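Note on the conversion above: `numeric_to_datetime` is plain unit scaling
into microseconds before the value is reinterpreted as a timestamp. A minimal
standalone sketch of the same arithmetic, assuming a factor table shaped like
`UNIT_TO_US_CONVERSION_FACTORS` (the table and names below are illustrative,
not the module's):

    # Illustrative only: the real code scales ibis expressions, not Python numbers.
    UNIT_TO_US = {"s": 1_000_000, "ms": 1_000, "us": 1, "ns": 1e-3}

    def to_microseconds(value, unit):
        # Scale a numeric value expressed in `unit` to integer microseconds.
        if unit not in UNIT_TO_US:
            raise ValueError(f"Cannot convert input with unit '{unit}'.")
        return int(value * UNIT_TO_US[unit])

    assert to_microseconds(1.5, "s") == 1_500_000
    assert to_microseconds(2_000, "ns") == 2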
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 366820f9f6..d78a88dfeb 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -60,6 +60,7 @@
     "boolean",
     "Float64",
     "Int64",
+    "int64[pyarrow]",
     "string",
     "string[pyarrow]",
     "timestamp[us, tz=UTC][pyarrow]",
@@ -173,6 +174,9 @@
 # "string" and "string[pyarrow]" are accepted
 BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")

+# special case - both "Int64" and "int64[pyarrow]" are accepted
+BIGFRAMES_STRING_TO_BIGFRAMES["int64[pyarrow]"] = pd.Int64Dtype()
+
 # For the purposes of dataframe.memory_usage
 # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
 DTYPE_BYTE_SIZES = {
@@ -310,11 +314,12 @@ def bigframes_dtype_to_ibis_dtype(
         textwrap.dedent(
             f"""
             Unexpected data type {bigframes_dtype}. The following
-            str dtypes are supppted: 'boolean','Float64','Int64', 'string',
-            'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]',
-            'timestamp[us][pyarrow]','date32[day][pyarrow]',
-            'time64[us][pyarrow]'. The following pandas.ExtensionDtype are
-            supported: pandas.BooleanDtype(), pandas.Float64Dtype(),
+            str dtypes are supported: 'boolean','Float64','Int64',
+            'int64[pyarrow]','string','string[pyarrow]',
+            'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
+            'date32[day][pyarrow]','time64[us][pyarrow]'.
+            The following pandas.ExtensionDtype are supported:
+            pandas.BooleanDtype(), pandas.Float64Dtype(),
             pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
             pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
             pd.ArrowDtype(pa.timestamp("us")),
@@ -434,6 +439,9 @@ def cast_ibis_value(
             ibis_dtypes.string,
             ibis_dtypes.Decimal(precision=38, scale=9),
             ibis_dtypes.Decimal(precision=76, scale=38),
+            ibis_dtypes.time,
+            ibis_dtypes.timestamp,
+            ibis_dtypes.Timestamp(timezone="UTC"),
         ),
         ibis_dtypes.float64: (
             ibis_dtypes.string,
@@ -447,8 +455,15 @@ def cast_ibis_value(
             ibis_dtypes.Decimal(precision=38, scale=9),
             ibis_dtypes.Decimal(precision=76, scale=38),
             ibis_dtypes.binary,
+            ibis_dtypes.date,
+            ibis_dtypes.timestamp,
+            ibis_dtypes.Timestamp(timezone="UTC"),
+        ),
+        ibis_dtypes.date: (
+            ibis_dtypes.string,
+            ibis_dtypes.timestamp,
+            ibis_dtypes.Timestamp(timezone="UTC"),
         ),
-        ibis_dtypes.date: (ibis_dtypes.string,),
         ibis_dtypes.Decimal(precision=38, scale=9): (
             ibis_dtypes.float64,
             ibis_dtypes.Decimal(precision=76, scale=38),
@@ -457,9 +472,24 @@ def cast_ibis_value(
             ibis_dtypes.float64,
             ibis_dtypes.Decimal(precision=38, scale=9),
         ),
-        ibis_dtypes.time: (),
-        ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
-        ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
+        ibis_dtypes.time: (
+            ibis_dtypes.int64,
+            ibis_dtypes.string,
+        ),
+        ibis_dtypes.timestamp: (
+            ibis_dtypes.date,
+            ibis_dtypes.int64,
+            ibis_dtypes.string,
+            ibis_dtypes.time,
+            ibis_dtypes.Timestamp(timezone="UTC"),
+        ),
+        ibis_dtypes.Timestamp(timezone="UTC"): (
+            ibis_dtypes.date,
+            ibis_dtypes.int64,
+            ibis_dtypes.string,
+            ibis_dtypes.time,
+            ibis_dtypes.timestamp,
+        ),
         ibis_dtypes.binary: (ibis_dtypes.string,),
     }
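Note on `cast_ibis_value` above: casts are gated by an allow-list keyed on
the source dtype, so a cast is rejected unless the target appears in the
source's tuple. A minimal sketch of that dispatch pattern (string keys stand
in for the ibis dtype objects; all names here are hypothetical):

    ALLOWED_CASTS = {
        "time": ("int64", "string"),
        "date": ("string", "timestamp"),
    }

    def checked_cast(expr, from_type, to_type):
        # Refuse any cast that is not explicitly allow-listed.
        if to_type not in ALLOWED_CASTS.get(from_type, ()):
            raise TypeError(f"Unsupported cast {from_type} -> {to_type}")
        return f"CAST({expr} AS {to_type})"  # placeholder for the real cast

    assert checked_cast("x", "time", "int64") == "CAST(x AS int64)"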
+ ("timestamp_col", "date32[day][pyarrow]"), + ("timestamp_col", "time64[us][pyarrow]"), ("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))), + ("datetime_col", "date32[day][pyarrow]"), + ("datetime_col", "string[pyarrow]"), + ("datetime_col", "time64[us][pyarrow]"), ("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), ("date_col", "string[pyarrow]"), + ("date_col", pd.ArrowDtype(pa.timestamp("us"))), + ("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("time_col", "string[pyarrow]"), # TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int # ("float64_col", "Int64"), # TODO(bmil): decide whether to fix Ibis bug: BigQuery backend @@ -2653,6 +2664,23 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("column", "to_type"), + [ + ("timestamp_col", "int64[pyarrow]"), + ("datetime_col", "int64[pyarrow]"), + ("time_col", "int64[pyarrow]"), + ], +) +@skip_legacy_pandas +def test_date_time_astype_int( + scalars_df_index, scalars_pandas_df_index, column, to_type +): + bf_result = scalars_df_index[column].astype(to_type).to_pandas() + pd_result = scalars_pandas_df_index[column].astype(to_type) + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + def test_string_astype_int(): pd_series = pd.Series(["4", "-7", "0", " -03"]) bf_series = series.Series(pd_series) @@ -2676,6 +2704,74 @@ def test_string_astype_float(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) +def test_string_astype_date(): + pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype( + pd.ArrowDtype(pa.string()) + ) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("date32[day][pyarrow]") + bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_datetime(): + pd_series = pd.Series( + ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"] + ).astype(pd.ArrowDtype(pa.string())) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) + bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_timestamp(): + pd_series = pd.Series( + [ + "2014-08-15 08:15:12+00:00", + "2015-08-15 08:15:12.654754+05:00", + "2016-02-29 00:00:00+08:00", + ] + ).astype(pd.ArrowDtype(pa.string())) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) + bf_result = bf_series.astype( + pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + ).to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_timestamp_astype_string(): + bf_series = series.Series( + [ + "2014-08-15 08:15:12+00:00", + "2015-08-15 08:15:12.654754+05:00", + "2016-02-29 00:00:00+08:00", + ] + ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) + + expected_result = pd.Series( + [ + "2014-08-15 08:15:12+00", + "2015-08-15 03:15:12.654754+00", + "2016-02-28 16:00:00+00", + ] + ) + bf_result = bf_series.astype(pa.string()).to_pandas() + + pd.testing.assert_series_equal( + bf_result, expected_result, check_index_type=False, check_dtype=False + ) + + @pytest.mark.parametrize( "index", [0, 5, -2], From 11b4a0cf8014330813b0341096dd3cf5ac958086 Mon 
From 11b4a0cf8014330813b0341096dd3cf5ac958086 Mon Sep 17 00:00:00 2001
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Date: Thu, 14 Mar 2024 13:22:52 -0700
Subject: [PATCH 2/4] chore: add deferred exec code samples (#439)

* chore: add deferred exec code samples

* fix tests

* fix tests
---
 bigframes/_config/compute_options.py | 11 ++++++++++
 .../pandas/core/config_init.py       | 20 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py
index 20c31d3906..fb708b844c 100644
--- a/bigframes/_config/compute_options.py
+++ b/bigframes/_config/compute_options.py
@@ -23,6 +23,17 @@ class ComputeOptions:
     """
     Encapsulates configuration for compute options.

+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+
+        >>> bpd.options.compute.maximum_bytes_billed = 500
+        >>> # df.to_pandas()  # this query exceeds the limit and fails with:
+        google.api_core.exceptions.InternalServerError: 500 Query exceeded limit for bytes billed: 500. 10485760 or higher required.
+
+        >>> bpd.options.compute.maximum_bytes_billed = None  # reset option
+
     Attributes:
         maximum_bytes_billed (int, Options):
             Limits the bytes billed for query jobs. Queries that will have
diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py
index dfb91dfeb8..33c6b3e093 100644
--- a/third_party/bigframes_vendored/pandas/core/config_init.py
+++ b/third_party/bigframes_vendored/pandas/core/config_init.py
@@ -15,6 +15,26 @@
 display_options_doc = """
 Encapsulates configuration for displaying objects.

+**Examples:**
+
+Setting the repr mode to "deferred" prevents job execution during repr.
+    >>> import bigframes.pandas as bpd
+    >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+
+    >>> bpd.options.display.repr_mode = "deferred"
+    >>> df.head(20)  # will no longer run the job
+    Computation deferred. Computation will process 28.9 kB
+
+Users can also get a dry run of the job by accessing the query_job property before running the job; this returns a dry run instance they can inspect.
+    >>> df.query_job.total_bytes_processed
+    28947
+
+Users can execute the job by calling .to_pandas().
+    >>> # df.to_pandas()
+
+Reset the option.
+    >>> bpd.options.display.repr_mode = "head"
+
 Attributes:
     max_columns (int, default 20):
         If `max_columns` is exceeded, switch to truncate view.
From 79cb05ae44e875d2e2694a93a10580ef742d41df Mon Sep 17 00:00:00 2001
From: TrevorBergeron
Date: Thu, 14 Mar 2024 15:14:29 -0700
Subject: [PATCH 3/4] feat: add DataFrame.pipe() method (#421)

---
 tests/system/small/test_dataframe.py          |  25 +++++
 tests/system/small/test_series.py             |  25 +++++
 .../bigframes_vendored/pandas/core/common.py  |  42 +++++++
 .../bigframes_vendored/pandas/core/generic.py | 105 +++++++++++++++++-
 4 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 third_party/bigframes_vendored/pandas/core/common.py

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 61dcd778ef..be4211a2fc 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1000,6 +1000,31 @@ def test_apply_series_scalar_callable(
     pandas.testing.assert_series_equal(bf_result, pd_result)


+def test_df_pipe(
+    scalars_df_index,
+    scalars_pandas_df_index,
+):
+    columns = ["int64_too", "int64_col"]
+
+    def foo(x: int, y: int, df):
+        return (df + x) % y
+
+    bf_result = (
+        scalars_df_index[columns]
+        .pipe((foo, "df"), x=7, y=9)
+        .pipe(lambda x: x**2)
+        .to_pandas()
+    )
+
+    pd_result = (
+        scalars_pandas_df_index[columns]
+        .pipe((foo, "df"), x=7, y=9)
+        .pipe(lambda x: x**2)
+    )
+
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
 def test_df_keys(
     scalars_df_index,
     scalars_pandas_df_index,
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 055a41a7da..3627e8249c 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -3299,3 +3299,28 @@ def test_apply_not_supported(scalars_dfs, col, lambda_, exception):
     bf_col = scalars_df[col]
     with pytest.raises(exception):
         bf_col.apply(lambda_, by_row=False)
+
+
+def test_series_pipe(
+    scalars_df_index,
+    scalars_pandas_df_index,
+):
+    column = "int64_too"
+
+    def foo(x: int, y: int, df):
+        return (df + x) % y
+
+    bf_result = (
+        scalars_df_index[column]
+        .pipe((foo, "df"), x=7, y=9)
+        .pipe(lambda x: x**2)
+        .to_pandas()
+    )
+
+    pd_result = (
+        scalars_pandas_df_index[column]
+        .pipe((foo, "df"), x=7, y=9)
+        .pipe(lambda x: x**2)
+    )
+
+    assert_series_equal(bf_result, pd_result)
diff --git a/third_party/bigframes_vendored/pandas/core/common.py b/third_party/bigframes_vendored/pandas/core/common.py
new file mode 100644
index 0000000000..ded5a22b8f
--- /dev/null
+++ b/third_party/bigframes_vendored/pandas/core/common.py
@@ -0,0 +1,42 @@
+# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/common.py
+from __future__ import annotations
+
+from typing import Callable, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from bigframes_vendored.pandas.pandas._typing import T
+
+
+def pipe(
+    obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs
+) -> T:
+    """
+    Apply a function ``func`` to object ``obj``, either by passing ``obj``
+    as the first argument to the function or, when ``func`` is a
+    ``(callable, data_keyword)`` tuple, by passing ``obj`` to the callable
+    as the keyword argument named by ``data_keyword``.
+
+    Args:
+        func (callable or tuple of (callable, str)):
+            Function to apply to this object or, alternatively, a
+            ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+            string indicating the keyword of ``callable`` that expects the
+            object.
+        args (iterable, optional):
+            Positional arguments passed into ``func``.
+        kwargs (dict, optional):
+            A dictionary of keyword arguments passed into ``func``.
+
+    Returns:
+        object: the return type of ``func``.
+    """
+    if isinstance(func, tuple):
+        func, target = func
+        if target in kwargs:
+            msg = f"{target} is both the pipe target and a keyword argument"
+            raise ValueError(msg)
+        kwargs[target] = obj
+        return func(*args, **kwargs)
+    else:
+        return func(obj, *args, **kwargs)
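The tuple branch above is the essential behavior: a ``(callable,
data_keyword)`` pair routes the piped object into the named keyword instead
of the first positional slot. A self-contained sketch of the same dispatch,
independent of the vendored module (function names are illustrative):

    def pipe(obj, func, *args, **kwargs):
        # Same shape as the vendored helper above, minus annotations.
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError(f"{target} is both the pipe target and a keyword argument")
            kwargs[target] = obj
            return func(*args, **kwargs)
        return func(obj, *args, **kwargs)

    def scale(factor, data):
        return [v * factor for v in data]

    # The piped object lands in the "data" keyword, not the first position:
    assert pipe([1, 2], (scale, "data"), 3) == [3, 6]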
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 01d8f7a174..7f8e1f7b53 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -1,12 +1,16 @@
 # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/generic.py
 from __future__ import annotations

-from typing import Iterator, Literal, Optional
+from typing import Callable, Iterator, Literal, Optional, TYPE_CHECKING

 from bigframes_vendored.pandas.core import indexing
+import bigframes_vendored.pandas.core.common as common

 from bigframes import constants

+if TYPE_CHECKING:
+    from bigframes_vendored.pandas.pandas._typing import T
+

 class NDFrame(indexing.IndexingMixin):
     """
@@ -963,6 +967,105 @@ def expanding(self, min_periods=1):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

+    def pipe(
+        self,
+        func: Callable[..., T] | tuple[Callable[..., T], str],
+        *args,
+        **kwargs,
+    ) -> T:
+        """
+        Apply chainable functions that expect Series or DataFrames.
+
+        **Examples:**
+
+        Constructing an income DataFrame from a list of lists.
+
+        >>> import bigframes.pandas as bpd
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]]
+        >>> df = bpd.DataFrame(data, columns=['Salary', 'Others'])
+        >>> df
+           Salary  Others
+        0    8000  1000.0
+        1    9500
+        2    5000  2000.0
+
+        [3 rows x 2 columns]
+
+        Functions that perform tax reductions on an income DataFrame.
+
+        >>> def subtract_federal_tax(df):
+        ...     return df * 0.9
+        >>> def subtract_state_tax(df, rate):
+        ...     return df * (1 - rate)
+        >>> def subtract_national_insurance(df, rate, rate_increase):
+        ...     new_rate = rate + rate_increase
+        ...     return df * (1 - new_rate)
+
+        Instead of writing
+
+        >>> subtract_national_insurance(
+        ...     subtract_state_tax(subtract_federal_tax(df), rate=0.12),
+        ...     rate=0.05,
+        ...     rate_increase=0.02)  # doctest: +SKIP
+
+        You can write
+
+        >>> (
+        ...     df.pipe(subtract_federal_tax)
+        ...     .pipe(subtract_state_tax, rate=0.12)
+        ...     .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02)
+        ... )
+            Salary   Others
+        0  5892.48   736.56
+        1  6997.32
+        2   3682.8  1473.12
+
+        [3 rows x 2 columns]
+
+        If you have a function that takes the data as (say) the second
+        argument, pass a tuple indicating which keyword expects the
+        data. For example, suppose ``national_insurance`` takes its data
+        as ``df`` in the second argument:
+
+        >>> def subtract_national_insurance(rate, df, rate_increase):
+        ...     new_rate = rate + rate_increase
+        ...     return df * (1 - new_rate)
+        >>> (
+        ...     df.pipe(subtract_federal_tax)
+        ...     .pipe(subtract_state_tax, rate=0.12)
+        ...     .pipe(
+        ...         (subtract_national_insurance, 'df'),
+        ...         rate=0.05,
+        ...         rate_increase=0.02
+        ...     )
+        ... )
+            Salary   Others
+        0  5892.48   736.56
+        1  6997.32
+        2   3682.8  1473.12
+
+        [3 rows x 2 columns]
+
+        Args:
+            func (function):
+                Function to apply to this object. ``args`` and ``kwargs``
+                are passed into ``func``. Alternatively a
+                ``(callable, data_keyword)`` tuple where ``data_keyword``
+                is a string indicating the keyword of ``callable`` that
+                expects this object.
+            args (iterable, optional):
+                Positional arguments passed into ``func``.
+            kwargs (mapping, optional):
+                A dictionary of keyword arguments passed into ``func``.
+
+        Returns:
+            same type as caller
+        """
+        return common.pipe(self, func, *args, **kwargs)
+
     def __nonzero__(self):
         raise ValueError(
             f"The truth value of a {type(self).__name__} is ambiguous. "
From efbf1f28c39ee2f2e327938c8a2d3095231eee31 Mon Sep 17 00:00:00 2001
From: Lily Zhang
Date: Fri, 15 Mar 2024 00:32:23 +0000
Subject: [PATCH 4/4] addressed comments

---
 bigframes/core/compile/scalar_op_compiler.py | 3 ++-
 tests/system/small/test_series.py            | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 6e5185dd5d..67761c0330 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -673,7 +673,8 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):

     if x.type() == ibis_dtypes.int64:
         # The conversion unit is set to "us" (microseconds) for consistency
-        # with pandas converting timestamp[us][pyarrow] to int64[pyarrow].
+        # with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
+        # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
         unit = "us"
         x_converted = numeric_to_datetime(x, unit)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 3627e8249c..e22037a1ce 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2679,6 +2679,7 @@ def test_date_time_astype_int(
     bf_result = scalars_df_index[column].astype(to_type).to_pandas()
     pd_result = scalars_pandas_df_index[column].astype(to_type)
     pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+    assert bf_result.dtype == "Int64"


 def test_string_astype_int():
@@ -2770,6 +2771,7 @@ def test_timestamp_astype_string():
     pd.testing.assert_series_equal(
         bf_result, expected_result, check_index_type=False, check_dtype=False
     )
+    assert bf_result.dtype == "string[pyarrow]"


 @pytest.mark.parametrize(