From 4bc27533527a9c132f6c9b5f6925bfe0926a99e6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 6 Dec 2023 23:12:43 +0000 Subject: [PATCH 1/6] feat: add replace method to DataFrame --- bigframes/dataframe.py | 17 ++++- bigframes/dtypes.py | 47 +++++++++++++ bigframes/series.py | 67 +++++++++++-------- tests/system/small/test_dataframe.py | 44 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 65 ++++++++++++++++++ 5 files changed, 211 insertions(+), 29 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3b0fd7008a..b956b05a68 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1561,6 +1561,21 @@ def interpolate(self, method: str = "linear") -> DataFrame: def fillna(self, value=None) -> DataFrame: return self._apply_binop(value, ops.fillna_op, how="left") + def replace( + self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False + ): + if utils.is_dict_like(value): + return self.apply( + lambda x: x.replace( + to_replace=to_replace, value=value[x.name], regex=regex + ) + if (x.name in value) + else x + ) + return self.apply( + lambda x: x.replace(to_replace=to_replace, value=value, regex=regex) + ) + def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: window = bigframes.core.WindowSpec(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) @@ -1850,7 +1865,7 @@ def melt( self._block.melt(id_col_ids, val_col_ids, var_name, value_name) ) - def describe(self) -> DataFrame: + def describe(self, *, include=None, exclude=None) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: raise NotImplementedError( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 774eb74d06..643e115a42 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -14,6 +14,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" +import datetime import textwrap import typing from 
typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -437,3 +438,49 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict: gcb3p_pandas_helpers.bq_to_arrow_data_type(field) ) return dtypes + + +def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: + """Captures whether a scalar can be losslessly represented by a dtype.""" + if scalar is None: + return True + if pd.api.types.is_bool_dtype(dtype): + return pd.api.types.is_bool(scalar) + if pd.api.types.is_float_dtype(dtype): + return pd.api.types.is_float(scalar) + if pd.api.types.is_int64_dtype(dtype): + return pd.api.types.is_integer(scalar) + if isinstance(dtype, pd.StringDtype): + return isinstance(scalar, str) + if isinstance(dtype, pd.ArrowDtype): + pa_type = dtype.pyarrow_dtype + return is_patype(scalar, pa_type) + return False + + +def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool: + if pa_type == pa.time64("us"): + return isinstance(scalar, datetime.time) + if pa_type == pa.timestamp("us"): + if isinstance(scalar, datetime.datetime): + return not scalar.tzinfo + if isinstance(scalar, pd.Timestamp): + return not scalar.tzinfo + if pa_type == pa.timestamp("us", tz="UTC"): + if isinstance(scalar, datetime.datetime): + return scalar.tzinfo == datetime.timezone.utc + if isinstance(scalar, pd.Timestamp): + return scalar.tzinfo == datetime.timezone.utc + if pa_type == pa.date32(): + return isinstance(scalar, datetime.date) + return False + + +def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool: + """Whether scalar can be compared to items of dtype (though maybe requiring coercion)""" + if is_dtype(scalar, dtype): + return True + elif pd.api.types.is_numeric_dtype(dtype): + return pd.api.types.is_number(scalar) + else: + return False diff --git a/bigframes/series.py b/bigframes/series.py index c929775a00..9012fb9ade 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -442,42 +442,53 @@ def replace( self, to_replace: typing.Any, value: typing.Any =
None, *, regex: bool = False ): if regex: - if not (isinstance(to_replace, str) and isinstance(value, str)): - raise NotImplementedError( - f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}" - ) - block, result_col = self._block.apply_unary_op( - self._value_column, - ops.ReplaceRegexOp(to_replace, value), - result_label=self.name, - ) - return Series(block.select_column(result_col)) + # No-op unless to_replace and series dtype are both string type + if not isinstance(to_replace, str) or not isinstance( + self.dtype, pandas.StringDtype + ): + return self + return self._regex_replace(to_replace, value) elif utils.is_dict_like(to_replace): raise NotImplementedError( f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}" ) elif utils.is_list_like(to_replace): - block, cond = self._block.apply_unary_op( - self._value_column, ops.IsInOp(to_replace) - ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, - ) - return Series(block.select_column(result_col)) + replace_list = to_replace else: # Scalar - block, cond = self._block.apply_unary_op( - self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace) + replace_list = [to_replace] + replace_list = [ + i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype) + ] + return self._simple_replace(replace_list, value) if replace_list else self + + def _regex_replace(self, to_replace: str, value: str): + if not bigframes.dtypes.is_dtype(value, self.dtype): + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. 
{constants.FEEDBACK_LINK}" ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, + block, result_col = self._block.apply_unary_op( + self._value_column, + ops.ReplaceRegexOp(to_replace, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) + + def _simple_replace(self, to_replace_list: typing.Sequence, value): + if not bigframes.dtypes.is_dtype(value, self.dtype): + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" ) - return Series(block.select_column(result_col)) + + block, cond = self._block.apply_unary_op( + self._value_column, ops.IsInOp(to_replace_list) + ) + block, result_col = block.apply_binary_op( + cond, + self._value_column, + ops.partial_arg1(ops.where_op, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) def interpolate(self, method: str = "linear") -> Series: if method == "pad": diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 45490e00ca..b0a96b5a6d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas() + pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!") + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_regex_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() + pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) + + 
pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas() + pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!") + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_value_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() + pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + def test_df_ffill(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 7168572705..164e210e1a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4006,6 +4006,71 @@ def fillna(self, value): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def replace( + self, + to_replace, + value=None, + *, + regex=False, + ): + """ + Replace values given in `to_replace` with `value`. + + Values of the Series/DataFrame are replaced with other values dynamically. + This differs from updating with ``.loc`` or ``.iloc``, which require + you to specify a location to update with some value. + + Args: + to_replace (str, regex, list, int, float or None): + How to find the values that will be replaced. 
+ + * numeric, str or regex: + + - numeric: numeric values equal to `to_replace` will be + replaced with `value` + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str, regex and numeric rules apply as above. + + value (scalar, default None): + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + regex (bool, default False): + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. + + Returns: + Series/DataFrame: Object after replacement. + + Raises: + TypeError: + * If `to_replace` is not a scalar, array-like, ``dict``, or ``None`` + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. 
+ * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to `to_replace` does not match the type of the + value being replaced + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def iloc(self): """Purely integer-location based indexing for selection by position.""" From 9c87404c97a4a55bbf0ccb87cc5421d7ffb8ca84 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 13 Dec 2023 22:47:45 +0000 Subject: [PATCH 2/6] remove unwanted change to describe method --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b956b05a68..f8bcb79ab1 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1865,7 +1865,7 @@ def melt( self._block.melt(id_col_ids, val_col_ids, var_name, value_name) ) - def describe(self, *, include=None, exclude=None) -> DataFrame: + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: raise NotImplementedError( From a5dda746fb844b21bbb9537f51922ddf2326aac3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 15 Dec 2023 21:46:56 +0000 Subject: [PATCH 3/6] better docs --- bigframes/operations/__init__.py | 14 ++++ bigframes/series.py | 20 +++++- .../bigframes_vendored/pandas/core/frame.py | 71 ++++++++++++++----- 3 files changed, 84 insertions(+), 21 deletions(-) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a29dd36c72..ff6af786bf 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -513,6 +513,20 @@ def _as_ibis(self, x: ibis_types.Value): return bigframes.dtypes.cast_ibis_value(x, self.to_type) +class MapOp(UnaryOp): + def __init__( + self, + mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...], + ): + self._mappings = mappings + + def _as_ibis(self, x: ibis_types.Value): + case = ibis.case() + for mapping in 
self._mappings: + case = case.when(x == mapping[0], mapping[1]) + return case.else_(x).end() + + class FindOp(UnaryOp): def __init__(self, sub, start, end): self._sub = sub diff --git a/bigframes/series.py b/bigframes/series.py index 9012fb9ade..b5455fe7e9 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -449,9 +449,7 @@ def replace( return self return self._regex_replace(to_replace, value) elif utils.is_dict_like(to_replace): - raise NotImplementedError( - f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}" - ) + return self._mapping_replace(to_replace) # type: ignore elif utils.is_list_like(to_replace): replace_list = to_replace else: # Scalar @@ -490,6 +488,22 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): ) return Series(block.select_column(result_col)) + def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): + tuples = [] + for key, value in mapping.items(): + if not bigframes.dtypes.is_comparable(key, self.dtype): + continue + if not bigframes.dtypes.is_dtype(value, self.dtype): + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" + ) + tuples.append((key, value)) + + block, result = self._block.apply_unary_op( + self._value_column, ops.MapOp(tuple(tuples)) + ) + return Series(block.select_column(result)) + def interpolate(self, method: str = "linear") -> Series: if method == "pad": return self.ffill() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 055227765a..b94ce7459e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4370,28 +4370,63 @@ def replace( This differs from updating with ``.loc`` or ``.iloc``, which require you to specify a location to update with some value. 
- Args: - to_replace (str, regex, list, int, float or None): - How to find the values that will be replaced. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'int_col': [1, 1, 2, 3], + ... 'string_col': ["a", "b", "c", "b"], + ... }) - * numeric, str or regex: + Using scalar `to_replace` and `value`: - - numeric: numeric values equal to `to_replace` will be - replaced with `value` - - str: string exactly matching `to_replace` will be replaced - with `value` - - regex: regexs matching `to_replace` will be replaced with - `value` + >>> df.replace("b", "e") + int_col string_col + 0 1 a + 1 1 e + 2 2 c + 3 3 e + + [4 rows x 2 columns] - * list of str, regex, or numeric: + Using dictionary: - - First, if `to_replace` and `value` are both lists, they - **must** be the same length. - - Second, if ``regex=True`` then all of the strings in **both** - lists will be interpreted as regexs otherwise they will match - directly. This doesn't matter much for `value` since there - are only a few possible substitution regexes you can use. - - str, regex and numeric rules apply as above. + >>> df.replace({"a": "e", 2: 5}) + int_col string_col + 0 1 e + 1 1 b + 2 5 c + 3 3 b + + [4 rows x 2 columns] + + Using regex: + + >>> df.replace("[ab]", "e", regex=True) + int_col string_col + 0 1 e + 1 1 e + 2 2 c + 3 3 e + + [4 rows x 2 columns] + + + Args: + to_replace (str, regex, list, int, float or None): + How to find the values that will be replaced. + numeric: numeric values equal to `to_replace` will be replaced with `value` + str: string exactly matching `to_replace` will be replaced with `value` + regex: regexs matching `to_replace` will be replaced with`value` + list of str, regex, or numeric: + First, if `to_replace` and `value` are both lists, they **must** be the same length. 
+ Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + str, regex and numeric rules apply as above. value (scalar, default None): Value to replace any values matching `to_replace` with. From 10364618f0f506bf290fe75299f845415d18bf53 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 15 Dec 2023 21:49:17 +0000 Subject: [PATCH 4/6] is_patype docstring --- bigframes/dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 643e115a42..b3bce0ec69 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -459,6 +459,7 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool: + """Determine whether a scalar's type matches a given pyarrow type.""" if pa_type == pa.time64("us"): return isinstance(scalar, datetime.time) if pa_type == pa.timestamp("us"): From fd5a5557e4ba1882b1413b3e8da7237e3d4940a7 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 19 Dec 2023 18:15:05 +0000 Subject: [PATCH 5/6] docstring fix --- third_party/bigframes_vendored/pandas/core/frame.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b94ce7459e..00be9e5e9e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4441,18 +4441,6 @@ def replace( Returns: Series/DataFrame: Object after replacement. 
- - Raises: - TypeError: - * If `to_replace` is not a scalar, array-like, ``dict``, or ``None`` - * If `to_replace` is a ``dict`` and `value` is not a ``list``, - ``dict``, ``ndarray``, or ``Series`` - * If `to_replace` is ``None`` and `regex` is not compilable - into a regular expression or is a list, dict, ndarray, or - Series. - * When replacing multiple ``bool`` or ``datetime64`` objects and - the arguments to `to_replace` does not match the type of the - value being replaced """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 4d2b6b3148d21a0bf65d4a04f5902ad90ac26df1 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 19 Dec 2023 18:54:18 +0000 Subject: [PATCH 6/6] mypy fix --- bigframes/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index b3bce0ec69..6dfcc17f37 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -448,7 +448,7 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: return pd.api.types.is_bool(scalar) if pd.api.types.is_float_dtype(dtype): return pd.api.types.is_float(scalar) - if pd.api.types.is_int64_dtype(dtype): + if pd.api.types.is_integer_dtype(dtype): return pd.api.types.is_integer(scalar) if isinstance(dtype, pd.StringDtype): return isinstance(scalar, str)