From 4bc27533527a9c132f6c9b5f6925bfe0926a99e6 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Wed, 6 Dec 2023 23:12:43 +0000
Subject: [PATCH 1/6] feat: add replace method to DataFrame

---
 bigframes/dataframe.py                        | 17 ++++-
 bigframes/dtypes.py                           | 47 +++++++++++++
 bigframes/series.py                           | 67 +++++++++++--------
 tests/system/small/test_dataframe.py          | 44 ++++++++++++
 .../bigframes_vendored/pandas/core/frame.py   | 65 ++++++++++++++++++
 5 files changed, 211 insertions(+), 29 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 3b0fd7008a..b956b05a68 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1561,6 +1561,21 @@ def interpolate(self, method: str = "linear") -> DataFrame:
     def fillna(self, value=None) -> DataFrame:
         return self._apply_binop(value, ops.fillna_op, how="left")
 
+    def replace(
+        self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
+    ):
+        if utils.is_dict_like(value):
+            return self.apply(
+                lambda x: x.replace(
+                    to_replace=to_replace, value=value[x.name], regex=regex
+                )
+                if (x.name in value)
+                else x
+            )
+        return self.apply(
+            lambda x: x.replace(to_replace=to_replace, value=value, regex=regex)
+        )
+
     def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame:
         window = bigframes.core.WindowSpec(preceding=limit, following=0)
         return self._apply_window_op(agg_ops.LastNonNullOp(), window)
@@ -1850,7 +1865,7 @@ def melt(
             self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
         )
 
-    def describe(self) -> DataFrame:
+    def describe(self, *, include=None, exclude=None) -> DataFrame:
         df_numeric = self._drop_non_numeric(keep_bool=False)
         if len(df_numeric.columns) == 0:
             raise NotImplementedError(
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 774eb74d06..643e115a42 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -14,6 +14,7 @@
 
 """Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
 
+import datetime
 import textwrap
 import typing
 from typing import Any, Dict, Iterable, Literal, Tuple, Union
@@ -437,3 +438,49 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
                 gcb3p_pandas_helpers.bq_to_arrow_data_type(field)
             )
     return dtypes
+
+
+def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
+    """Captures whether a scalar can be losslessly represented by a dtype."""
+    if scalar is None:
+        return True
+    if pd.api.types.is_bool_dtype(dtype):
+        return pd.api.types.is_bool(scalar)
+    if pd.api.types.is_float_dtype(dtype):
+        return pd.api.types.is_float(scalar)
+    if pd.api.types.is_int64_dtype(dtype):
+        return pd.api.types.is_integer(scalar)
+    if isinstance(dtype, pd.StringDtype):
+        return isinstance(scalar, str)
+    if isinstance(dtype, pd.ArrowDtype):
+        pa_type = dtype.pyarrow_dtype
+        return is_patype(scalar, pa_type)
+    return False
+
+
+def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
+    if pa_type == pa.time64("us"):
+        return isinstance(scalar, datetime.time)
+    if pa_type == pa.timestamp("us"):
+        if isinstance(scalar, datetime.datetime):
+            return not scalar.tzinfo
+        if isinstance(scalar, pd.Timestamp):
+            return not scalar.tzinfo
+    if pa_type == pa.timestamp("us", tz="UTC"):
+        if isinstance(scalar, datetime.datetime):
+            return scalar.tzinfo == datetime.timezone.utc
+        if isinstance(scalar, pd.Timestamp):
+            return scalar.tzinfo == datetime.timezone.utc
+    if pa_type == pa.date32():
+        return isinstance(scalar, datetime.date)
+    return False
+
+
+def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
+    """Whether scalar can be compared to items of dtype (though maybe requiring coercion)"""
+    if is_dtype(scalar, dtype):
+        return True
+    elif pd.api.types.is_numeric_dtype(dtype):
+        return pd.api.types.is_number(scalar)
+    else:
+        return False
diff --git a/bigframes/series.py b/bigframes/series.py
index c929775a00..9012fb9ade 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -442,42 +442,53 @@ def replace(
         self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
     ):
         if regex:
-            if not (isinstance(to_replace, str) and isinstance(value, str)):
-                raise NotImplementedError(
-                    f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}"
-                )
-            block, result_col = self._block.apply_unary_op(
-                self._value_column,
-                ops.ReplaceRegexOp(to_replace, value),
-                result_label=self.name,
-            )
-            return Series(block.select_column(result_col))
+            # No-op unless to_replace and series dtype are both string type
+            if not isinstance(to_replace, str) or not isinstance(
+                self.dtype, pandas.StringDtype
+            ):
+                return self
+            return self._regex_replace(to_replace, value)
         elif utils.is_dict_like(to_replace):
             raise NotImplementedError(
                 f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
             )
         elif utils.is_list_like(to_replace):
-            block, cond = self._block.apply_unary_op(
-                self._value_column, ops.IsInOp(to_replace)
-            )
-            block, result_col = block.apply_binary_op(
-                cond,
-                self._value_column,
-                ops.partial_arg1(ops.where_op, value),
-                result_label=self.name,
-            )
-            return Series(block.select_column(result_col))
+            replace_list = to_replace
         else:  # Scalar
-            block, cond = self._block.apply_unary_op(
-                self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace)
+            replace_list = [to_replace]
+        replace_list = [
+            i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
+        ]
+        return self._simple_replace(replace_list, value) if replace_list else self
+
+    def _regex_replace(self, to_replace: str, value: str):
+        if not bigframes.dtypes.is_dtype(value, self.dtype):
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
-            block, result_col = block.apply_binary_op(
-                cond,
-                self._value_column,
-                ops.partial_arg1(ops.where_op, value),
-                result_label=self.name,
+        block, result_col = self._block.apply_unary_op(
+            self._value_column,
+            ops.ReplaceRegexOp(to_replace, value),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result_col))
+
+    def _simple_replace(self, to_replace_list: typing.Sequence, value):
+        if not bigframes.dtypes.is_dtype(value, self.dtype):
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
-            return Series(block.select_column(result_col))
+
+        block, cond = self._block.apply_unary_op(
+            self._value_column, ops.IsInOp(to_replace_list)
+        )
+        block, result_col = block.apply_binary_op(
+            cond,
+            self._value_column,
+            ops.partial_arg1(ops.where_op, value),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result_col))
 
     def interpolate(self, method: str = "linear") -> Series:
         if method == "pad":
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 45490e00ca..b0a96b5a6d 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_df_replace_scalar_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas()
+    pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!")
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_regex_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas()
+    pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True)
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_list_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas()
+    pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!")
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_value_dict(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas()
+    pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200})
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
 def test_df_ffill(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 7168572705..164e210e1a 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4006,6 +4006,71 @@ def fillna(self, value):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def replace(
+        self,
+        to_replace,
+        value=None,
+        *,
+        regex=False,
+    ):
+        """
+        Replace values given in `to_replace` with `value`.
+
+        Values of the Series/DataFrame are replaced with other values dynamically.
+        This differs from updating with ``.loc`` or ``.iloc``, which require
+        you to specify a location to update with some value.
+
+        Args:
+            to_replace (str, regex, list, int, float or None):
+                How to find the values that will be replaced.
+
+                * numeric, str or regex:
+
+                    - numeric: numeric values equal to `to_replace` will be
+                      replaced with `value`
+                    - str: string exactly matching `to_replace` will be replaced
+                      with `value`
+                    - regex: regexs matching `to_replace` will be replaced with
+                      `value`
+
+                * list of str, regex, or numeric:
+
+                    - First, if `to_replace` and `value` are both lists, they
+                      **must** be the same length.
+                    - Second, if ``regex=True`` then all of the strings in **both**
+                      lists will be interpreted as regexs otherwise they will match
+                      directly. This doesn't matter much for `value` since there
+                      are only a few possible substitution regexes you can use.
+                    - str, regex and numeric rules apply as above.
+
+            value (scalar, default None):
+                Value to replace any values matching `to_replace` with.
+                For a DataFrame a dict of values can be used to specify which
+                value to use for each column (columns not in the dict will not be
+                filled). Regular expressions, strings and lists or dicts of such
+                objects are also allowed.
+            regex (bool, default False):
+                Whether to interpret `to_replace` and/or `value` as regular
+                expressions. If this is ``True`` then `to_replace` *must* be a
+                string.
+
+        Returns:
+            Series/DataFrame: Object after replacement.
+
+        Raises:
+            TypeError:
+                * If `to_replace` is not a scalar, array-like, ``dict``, or ``None``
+                * If `to_replace` is a ``dict`` and `value` is not a ``list``,
+                  ``dict``, ``ndarray``, or ``Series``
+                * If `to_replace` is ``None`` and `regex` is not compilable
+                  into a regular expression or is a list, dict, ndarray, or
+                  Series.
+                * When replacing multiple ``bool`` or ``datetime64`` objects and
+                  the arguments to `to_replace` does not match the type of the
+                  value being replaced
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     @property
     def iloc(self):
         """Purely integer-location based indexing for selection by position."""

From 9c87404c97a4a55bbf0ccb87cc5421d7ffb8ca84 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Wed, 13 Dec 2023 22:47:45 +0000
Subject: [PATCH 2/6] remove unwanted change to describe method

---
 bigframes/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index b956b05a68..f8bcb79ab1 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1865,7 +1865,7 @@ def melt(
             self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
         )
 
-    def describe(self, *, include=None, exclude=None) -> DataFrame:
+    def describe(self) -> DataFrame:
         df_numeric = self._drop_non_numeric(keep_bool=False)
         if len(df_numeric.columns) == 0:
             raise NotImplementedError(

From a5dda746fb844b21bbb9537f51922ddf2326aac3 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Fri, 15 Dec 2023 21:46:56 +0000
Subject: [PATCH 3/6] better docs

---
 bigframes/operations/__init__.py              | 14 ++++
 bigframes/series.py                           | 20 +++++-
 .../bigframes_vendored/pandas/core/frame.py   | 71 ++++++++++++++-----
 3 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index a29dd36c72..ff6af786bf 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -513,6 +513,20 @@ def _as_ibis(self, x: ibis_types.Value):
         return bigframes.dtypes.cast_ibis_value(x, self.to_type)
 
 
+class MapOp(UnaryOp):
+    def __init__(
+        self,
+        mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...],
+    ):
+        self._mappings = mappings
+
+    def _as_ibis(self, x: ibis_types.Value):
+        case = ibis.case()
+        for mapping in self._mappings:
+            case = case.when(x == mapping[0], mapping[1])
+        return case.else_(x).end()
+
+
 class FindOp(UnaryOp):
     def __init__(self, sub, start, end):
         self._sub = sub
diff --git a/bigframes/series.py b/bigframes/series.py
index 9012fb9ade..b5455fe7e9 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -449,9 +449,7 @@ def replace(
                 return self
             return self._regex_replace(to_replace, value)
         elif utils.is_dict_like(to_replace):
-            raise NotImplementedError(
-                f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
-            )
+            return self._mapping_replace(to_replace)  # type: ignore
         elif utils.is_list_like(to_replace):
             replace_list = to_replace
         else:  # Scalar
@@ -490,6 +488,22 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value):
         )
         return Series(block.select_column(result_col))
 
+    def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
+        tuples = []
+        for key, value in mapping.items():
+            if not bigframes.dtypes.is_comparable(key, self.dtype):
+                continue
+            if not bigframes.dtypes.is_dtype(value, self.dtype):
+                raise NotImplementedError(
+                    f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
+                )
+            tuples.append((key, value))
+
+        block, result = self._block.apply_unary_op(
+            self._value_column, ops.MapOp(tuple(tuples))
+        )
+        return Series(block.select_column(result))
+
     def interpolate(self, method: str = "linear") -> Series:
         if method == "pad":
             return self.ffill()
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 055227765a..b94ce7459e 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4370,28 +4370,63 @@ def replace(
         This differs from updating with ``.loc`` or ``.iloc``, which require
         you to specify a location to update with some value.
 
-        Args:
-            to_replace (str, regex, list, int, float or None):
-                How to find the values that will be replaced.
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'int_col': [1, 1, 2, 3],
+            ...     'string_col': ["a", "b", "c", "b"],
+            ...     })
 
-                * numeric, str or regex:
+        Using scalar `to_replace` and `value`:
 
-                    - numeric: numeric values equal to `to_replace` will be
-                      replaced with `value`
-                    - str: string exactly matching `to_replace` will be replaced
-                      with `value`
-                    - regex: regexs matching `to_replace` will be replaced with
-                      `value`
+            >>> df.replace("b", "e")
+               int_col string_col
+            0        1          a
+            1        1          e
+            2        2          c
+            3        3          e
+            <BLANKLINE>
+            [4 rows x 2 columns]
 
-                * list of str, regex, or numeric:
+        Using dictionary:
 
-                    - First, if `to_replace` and `value` are both lists, they
-                      **must** be the same length.
-                    - Second, if ``regex=True`` then all of the strings in **both**
-                      lists will be interpreted as regexs otherwise they will match
-                      directly. This doesn't matter much for `value` since there
-                      are only a few possible substitution regexes you can use.
-                    - str, regex and numeric rules apply as above.
+            >>> df.replace({"a": "e", 2: 5})
+               int_col string_col
+            0        1          e
+            1        1          b
+            2        5          c
+            3        3          b
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Using regex:
+
+            >>> df.replace("[ab]", "e", regex=True)
+               int_col string_col
+            0        1          e
+            1        1          e
+            2        2          c
+            3        3          e
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+
+        Args:
+            to_replace (str, regex, list, int, float or None):
+                How to find the values that will be replaced.
+                numeric: numeric values equal to `to_replace` will be replaced with `value`
+                str: string exactly matching `to_replace` will be replaced with `value`
+                regex: regexs matching `to_replace` will be replaced with `value`
+                list of str, regex, or numeric:
+                First, if `to_replace` and `value` are both lists, they **must** be the same length.
+                Second, if ``regex=True`` then all of the strings in **both**
+                lists will be interpreted as regexs otherwise they will match
+                directly. This doesn't matter much for `value` since there
+                are only a few possible substitution regexes you can use.
+                str, regex and numeric rules apply as above.
 
             value (scalar, default None):
                 Value to replace any values matching `to_replace` with.

From 10364618f0f506bf290fe75299f845415d18bf53 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Fri, 15 Dec 2023 21:49:17 +0000
Subject: [PATCH 4/6] is_patype docstring

---
 bigframes/dtypes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 643e115a42..b3bce0ec69 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -459,6 +459,7 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
 
 
 def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
+    """Determine whether a scalar's type matches a given pyarrow type."""
     if pa_type == pa.time64("us"):
         return isinstance(scalar, datetime.time)
     if pa_type == pa.timestamp("us"):

From fd5a5557e4ba1882b1413b3e8da7237e3d4940a7 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Tue, 19 Dec 2023 18:15:05 +0000
Subject: [PATCH 5/6] docstring fix

---
 third_party/bigframes_vendored/pandas/core/frame.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index b94ce7459e..00be9e5e9e 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4441,18 +4441,6 @@ def replace(
 
         Returns:
             Series/DataFrame: Object after replacement.
-
-        Raises:
-            TypeError:
-                * If `to_replace` is not a scalar, array-like, ``dict``, or ``None``
-                * If `to_replace` is a ``dict`` and `value` is not a ``list``,
-                  ``dict``, ``ndarray``, or ``Series``
-                * If `to_replace` is ``None`` and `regex` is not compilable
-                  into a regular expression or is a list, dict, ndarray, or
-                  Series.
-                * When replacing multiple ``bool`` or ``datetime64`` objects and
-                  the arguments to `to_replace` does not match the type of the
-                  value being replaced
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 

From 4d2b6b3148d21a0bf65d4a04f5902ad90ac26df1 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Tue, 19 Dec 2023 18:54:18 +0000
Subject: [PATCH 6/6] mypy fix

---
 bigframes/dtypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index b3bce0ec69..6dfcc17f37 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -448,7 +448,7 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
         return pd.api.types.is_bool(scalar)
     if pd.api.types.is_float_dtype(dtype):
         return pd.api.types.is_float(scalar)
-    if pd.api.types.is_int64_dtype(dtype):
+    if pd.api.types.is_integer_dtype(dtype):
         return pd.api.types.is_integer(scalar)
     if isinstance(dtype, pd.StringDtype):
         return isinstance(scalar, str)