From 264becbd49eb9d6854768be492454f72fd7fb064 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 24 Jan 2024 02:38:11 +0000 Subject: [PATCH 01/10] feat: limited support of lambdas in `Series.apply` --- bigframes/core/compile/scalar_op_compiler.py | 4 --- tests/system/small/test_series.py | 29 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index bf0755acc7..a30cb676d3 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -658,10 +658,6 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): - if not hasattr(op.func, "bigframes_remote_function"): - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) x_transformed = op.func(x) if not op.apply_on_null: x_transformed = ibis.case().when(x.isnull(), x).else_(x_transformed).end() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 69b35d102c..e2a97c03d8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2997,3 +2997,32 @@ def test_series_iter( scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"] ): assert bf_i == pd_i + + +@pytest.mark.parametrize( + ("lambda_",), + [ + pytest.param(lambda x: x * x + x + 1), + pytest.param( + lambda x: f"I got {x}", + marks=pytest.mark.xfail( + raises=AttributeError, + ), + ), + ], + ids=[ + "lamda_arithmatic", + "lambda_arbitrary", + ], +) +def test_apply_lambda(scalars_dfs, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.apply(lambda_).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(lambda_) + + # 
ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) From 266a3ea2e4f6e25e04032e896c773d0860666e98 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 25 Jan 2024 02:37:45 +0000 Subject: [PATCH 02/10] add code sample for non-remote-function `Series.apply` --- .../bigframes_vendored/pandas/core/series.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1aa4ffffbb..203c53c334 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1152,6 +1152,42 @@ def apply( >>> names = bpd.Series(["Alice", "Bob"]) >>> hashes = names.apply(get_hash) + There is a limited support of simple functions and lambdas which can be + operated directly (without converting into a `remote_function`) on the + BigQuery DataFrames objects. + + .. note:: + Bigframes does not yet support ``dict`` subclasses that define + ``__missing__`` (i.e. provide a method for default values). These + are treated the same as ``dict``. + + This approach takes advantage of a nuance in the way BigQuery DataFrames + objects are modelled internally and works only if the function body + contains only arithmatic or logical operators. + + >>> nums = bpd.Series([1, 2, 3, 4]) + >>> nums + 0 1 + 1 2 + 2 3 + 3 4 + dtype: Int64 + >>> nums.apply(lambda x: x*x + 2*x + 1) + 0 4 + 1 9 + 2 16 + 3 25 + dtype: Int64 + + >>> def is_odd(num): + ... return num % 2 == 1 + >>> nums.apply(is_odd) + 0 True + 1 False + 2 True + 3 False + dtype: boolean + Args: func (function): BigFrames DataFrames ``remote_function`` to apply. 
The function From d3f9878076fccc901e550e8a5e69c335f79de42a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 25 Jan 2024 08:42:55 +0000 Subject: [PATCH 03/10] remove ..note in the middle of code samples due to rendering issue --- .../bigframes_vendored/pandas/core/series.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 203c53c334..a6de38236d 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1154,16 +1154,9 @@ def apply( There is a limited support of simple functions and lambdas which can be operated directly (without converting into a `remote_function`) on the - BigQuery DataFrames objects. - - .. note:: - Bigframes does not yet support ``dict`` subclasses that define - ``__missing__`` (i.e. provide a method for default values). These - are treated the same as ``dict``. - - This approach takes advantage of a nuance in the way BigQuery DataFrames - objects are modelled internally and works only if the function body - contains only arithmatic or logical operators. + BigQuery DataFrames objects. This approach takes advantage of a nuance + in the way BigQuery DataFrames objects are modeled internally and works + only if the function body contains only arithmatic or logical operators. 
>>> nums = bpd.Series([1, 2, 3, 4]) >>> nums From 89b8dd1cbc704b545bbc8d1b7b52e996f2c5f3d8 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 25 Jan 2024 22:54:14 +0000 Subject: [PATCH 04/10] fix typo --- tests/system/small/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index e2a97c03d8..55e85528ae 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3011,7 +3011,7 @@ def test_series_iter( ), ], ids=[ - "lamda_arithmatic", + "lambda_arithmatic", "lambda_arbitrary", ], ) From 4ed8f7b068466bf9c7621b2ede937d889aacf165 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 26 Jan 2024 01:56:45 +0000 Subject: [PATCH 05/10] add lambda test coverage and code samples for `Series.mask` --- tests/system/small/test_series.py | 61 +++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 32 +++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 55e85528ae..f12a00881e 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2515,6 +2515,51 @@ def test_mask_custom_value(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("lambda_",), + [ + pytest.param(lambda x: x > 0), + pytest.param( + lambda x: True if x > 0 else False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "lambda_arithmatic", + "lambda_arbitrary", + ], +) +def test_mask_lambda(scalars_dfs, lambda_): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.apply(lambda_).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(lambda_) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def 
test_mask_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x < 1000000 + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.apply(foo).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(foo) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("column", "to_type"), [ @@ -3026,3 +3071,19 @@ def test_apply_lambda(scalars_dfs, lambda_): # ignore dtype check, which are Int64 and object respectively assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_apply_simple_udf(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + def foo(x): + return x * x + 2 * x + 3 + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.apply(foo).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(foo) + + # ignore dtype check, which are Int64 and object respectively + assert_series_equal(bf_result, pd_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a6de38236d..6ece262812 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2625,7 +2625,8 @@ def mask(self, cond, other): dtype: Int64 You can mask the values in the Series based on a condition. The values - matching the condition would be masked. + matching the condition would be masked. The condition can be provided in + the form of a Series. >>> s.mask(s % 2 == 0) 0 @@ -2681,6 +2682,35 @@ def mask(self, cond, other): 2 Caroline dtype: string + There is a limited support of simple functions and lambdas which can be + operated directly (without converting into a `remote_function`) on the + BigQuery DataFrames objects. 
This approach takes advantage of a nuance + in the way BigQuery DataFrames objects are modeled internally and works + only if the function body contains only arithmatic or logical operators. + + >>> nums = bpd.Series([1, 2, 3, 4], name="nums") + >>> nums + 0 1 + 1 2 + 2 3 + 3 4 + Name: nums, dtype: Int64 + >>> nums.mask(lambda x: (x+1) % 2 == 1) + 0 1 + 1 + 2 3 + 3 + Name: nums, dtype: Int64 + + >>> def is_odd(num): + ... return num % 2 == 1 + >>> nums.mask(is_odd) + 0 + 1 2 + 2 + 3 4 + Name: nums, dtype: Int64 + Args: cond (bool Series/DataFrame, array-like, or callable): Where cond is False, keep the original value. Where True, replace From 2603ba138d41fdc89f3a3bfe4f243a4a71fa55ab Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 31 Jan 2024 23:47:17 +0000 Subject: [PATCH 06/10] apply the non-remote function on series level --- bigframes/core/compile/scalar_op_compiler.py | 4 +++ bigframes/series.py | 8 +++++ tests/system/small/test_series.py | 32 +++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 35 +++++++++++-------- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a30cb676d3..bf0755acc7 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -658,6 +658,10 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): + if not hasattr(op.func, "bigframes_remote_function"): + raise TypeError( + f"only a bigframes remote function is supported as a callable. 
{constants.FEEDBACK_LINK}" + ) x_transformed = op.func(x) if not op.apply_on_null: x_transformed = ibis.case().when(x.isnull(), x).else_(x_transformed).end() diff --git a/bigframes/series.py b/bigframes/series.py index c802fd2467..a208336528 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1216,6 +1216,14 @@ def apply(self, func) -> Series: # Reproject as workaround to applying filter too late. This forces the filter # to be applied before passing data to remote function, protecting from bad # inputs causing errors. + if not callable(func): + raise ValueError( + "Only a ufunc (a NumPy function that applies to the entire Series) or a remote function that only works on single values are supported." + ) + + if not hasattr(func, "bigframes_remote_function"): + return func(self) + reprojected_series = Series(self._block._force_reproject()) return reprojected_series._apply_unary_op( ops.RemoteFunctionOp(func=func, apply_on_null=True) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 65e7b40877..93183f2b96 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3088,10 +3088,17 @@ def test_series_iter( raises=AttributeError, ), ), + pytest.param( + {1: 2, 3: 4}, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), ], ids=[ "lambda_arithmatic", "lambda_arbitrary", + "not_lambda", ], ) def test_apply_lambda(scalars_dfs, lambda_): @@ -3107,6 +3114,31 @@ def test_apply_lambda(scalars_dfs, lambda_): assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.log), + pytest.param(numpy.sqrt), + pytest.param(numpy.sin), + ], + ids=[ + "log", + "sqrt", + "sin", + ], +) +def test_apply_numpy_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"] + bf_result = bf_col.apply(ufunc).to_pandas() + + pd_col = scalars_pandas_df["int64_col"] + pd_result = pd_col.apply(ufunc) + 
+ assert_series_equal(bf_result, pd_result) + + def test_apply_simple_udf(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 79adb966cd..3713cefb53 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1099,14 +1099,19 @@ def apply( """ Invoke function on values of a Series. + Can be ufunc (a NumPy function that applies to the entire Series) or a + Python function that only works on single values. If it is an arbitrary + python function then converting it into a `remote_function` is recommended. + **Examples:** >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + For applying arbitrary python function a `remote_function` is recommended. + Let's use ``reuse=False`` flag to make sure a new `remote_function` is created every time we run the following code, but you can skip it - to potentially reuse a previously deployed ``remote_function`` from + to potentially reuse a previously deployed `remote_function` from the same user defined function. >>> @bpd.remote_function([int], float, reuse=False) @@ -1131,9 +1136,9 @@ def apply( 4 2.0 dtype: Float64 - You could turn a user defined function with external package - dependencies into a BigQuery DataFrames remote function. You would - provide the names of the packages via ``packages`` param. + To turn a user defined function with external package dependencies into + a `remote_function`, you would provide the names of the packages via + `packages` param. >>> @bpd.remote_function( ... 
[str], @@ -1155,11 +1160,7 @@ def apply( >>> names = bpd.Series(["Alice", "Bob"]) >>> hashes = names.apply(get_hash) - There is a limited support of simple functions and lambdas which can be - operated directly (without converting into a `remote_function`) on the - BigQuery DataFrames objects. This approach takes advantage of a nuance - in the way BigQuery DataFrames objects are modeled internally and works - only if the function body contains only arithmatic or logical operators. + Simple functions, lambdas or ufuncs can be applied directly. >>> nums = bpd.Series([1, 2, 3, 4]) >>> nums @@ -1184,6 +1185,13 @@ def apply( 3 False dtype: boolean + >>> nums.apply(np.log) + 0 0.0 + 1 0.693147 + 2 1.098612 + 3 1.386294 + dtype: Float64 + Args: func (function): BigFrames DataFrames ``remote_function`` to apply. The function @@ -2745,11 +2753,8 @@ def mask(self, cond, other): 2 Caroline dtype: string - There is a limited support of simple functions and lambdas which can be - operated directly (without converting into a `remote_function`) on the - BigQuery DataFrames objects. This approach takes advantage of a nuance - in the way BigQuery DataFrames objects are modeled internally and works - only if the function body contains only arithmatic or logical operators. + Simple lambdas or python functions can be used as long as they only + perform operations supported on a Series. 
>>> nums = bpd.Series([1, 2, 3, 4], name="nums") >>> nums From e782877110048cc73cc20325ae2c930220968d6f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 9 Feb 2024 21:45:43 +0000 Subject: [PATCH 07/10] add suggestion to use remote function if direct func errors out --- bigframes/series.py | 16 ++++++++- tests/system/small/test_series.py | 54 +++++++++++++++++++------------ 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 5898a243a4..4aa0f5e8b2 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -58,6 +58,12 @@ LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] +_remote_function_recommendation_message = ( + "Your functions could not be applied directly to the Series." + " Try converting it to a remote function." +) + + @log_adapter.class_logger class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): def __init__(self, *args, **kwargs): @@ -1222,7 +1228,15 @@ def apply(self, func) -> Series: ) if not hasattr(func, "bigframes_remote_function"): - return func(self) + try: + return func(self) + except Exception as ex: + # This could happen if any of the operators in func is not + # supported on a Series. 
Let's guide the customer to use a + # remote function instead + if hasattr(ex, "message"): + ex.message += "\n{_remote_function_recommendation_message}" + raise reprojected_series = Series(self._block._force_reproject()) return reprojected_series._apply_unary_op( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c0376e069b..b0b176c4e8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3090,35 +3090,28 @@ def test_series_iter( @pytest.mark.parametrize( - ("lambda_",), + ( + "col", + "lambda_", + ), [ - pytest.param(lambda x: x * x + x + 1), - pytest.param( - lambda x: f"I got {x}", - marks=pytest.mark.xfail( - raises=AttributeError, - ), - ), - pytest.param( - {1: 2, 3: 4}, - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), + pytest.param("int64_col", lambda x: x * x + x + 1), + pytest.param("int64_col", lambda x: x % 2 == 1), + pytest.param("string_col", lambda x: x + "_suffix"), ], ids=[ - "lambda_arithmatic", - "lambda_arbitrary", - "not_lambda", + "lambda_int_int", + "lambda_int_bool", + "lambda_str_str", ], ) -def test_apply_lambda(scalars_dfs, lambda_): +def test_apply_lambda(scalars_dfs, col, lambda_): scalars_df, scalars_pandas_df = scalars_dfs - bf_col = scalars_df["int64_col"] + bf_col = scalars_df[col] bf_result = bf_col.apply(lambda_).to_pandas() - pd_col = scalars_pandas_df["int64_col"] + pd_col = scalars_pandas_df[col] pd_result = pd_col.apply(lambda_) # ignore dtype check, which are Int64 and object respectively @@ -3164,3 +3157,24 @@ def foo(x): # ignore dtype check, which are Int64 and object respectively assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("col", "lambda_", "exception"), + [ + pytest.param("int64_col", {1: 2, 3: 4}, ValueError), + pytest.param("int64_col", numpy.square, TypeError), + pytest.param("string_col", lambda x: x.capitalize(), AttributeError), + ], + ids=[ + "not_callable", + "numpy_ufunc", + 
"custom_lambda", + ], +) +def test_apply_not_supported(scalars_dfs, col, lambda_, exception): + scalars_df, _ = scalars_dfs + + bf_col = scalars_df[col] + with pytest.raises(exception): + bf_col.apply(lambda_) From 4357d4f086f96eee2f862335ef9c9f31e342f70d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 9 Feb 2024 22:50:36 +0000 Subject: [PATCH 08/10] support by_row param in Series.apply --- bigframes/series.py | 19 ++++++++++++--- tests/system/small/test_series.py | 23 +++++++++++++++---- .../bigframes_vendored/pandas/core/series.py | 14 +++++++---- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 4aa0f5e8b2..012fd3fd5f 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1216,18 +1216,31 @@ def _groupby_values( dropna=dropna, ) - def apply(self, func) -> Series: + def apply( + self, func, by_row: typing.Union[typing.Literal["compat"], bool] = "compat" + ) -> Series: # TODO(shobs, b/274645634): Support convert_dtype, args, **kwargs # is actually a ternary op # Reproject as workaround to applying filter too late. This forces the filter # to be applied before passing data to remote function, protecting from bad # inputs causing errors. + + if by_row not in ["compat", False]: + raise ValueError("Param by_row must be one of 'compat' or False") + if not callable(func): raise ValueError( - "Only a ufunc (a NumPy function that applies to the entire Series) or a remote function that only works on single values are supported." + "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." 
) if not hasattr(func, "bigframes_remote_function"): + # It is not a remote function + # Then it must be a vectorized function that applies to the Series + # as a whole + assert ( + not by_row + ), "A vectorized non-remote function can be provided only with by_row=False" + try: return func(self) except Exception as ex: @@ -1235,7 +1248,7 @@ def apply(self, func) -> Series: # supported on a Series. Let's guide the customer to use a # remote function instead if hasattr(ex, "message"): - ex.message += "\n{_remote_function_recommendation_message}" + ex.message += f"\n{_remote_function_recommendation_message}" raise reprojected_series = Series(self._block._force_reproject()) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index b0b176c4e8..25cbc33e31 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3109,7 +3109,12 @@ def test_apply_lambda(scalars_dfs, col, lambda_): scalars_df, scalars_pandas_df = scalars_dfs bf_col = scalars_df[col] - bf_result = bf_col.apply(lambda_).to_pandas() + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(AssertionError, match="by_row=False"): + bf_col.apply(lambda_) + + bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() pd_col = scalars_pandas_df[col] pd_result = pd_col.apply(lambda_) @@ -3135,7 +3140,12 @@ def test_apply_numpy_ufunc(scalars_dfs, ufunc): scalars_df, scalars_pandas_df = scalars_dfs bf_col = scalars_df["int64_col"] - bf_result = bf_col.apply(ufunc).to_pandas() + + # Can't be applied to BigFrames Series without by_row=False + with pytest.raises(AssertionError, match="by_row=False"): + bf_col.apply(ufunc) + + bf_result = bf_col.apply(ufunc, by_row=False).to_pandas() pd_col = scalars_pandas_df["int64_col"] pd_result = pd_col.apply(ufunc) @@ -3150,7 +3160,12 @@ def foo(x): return x * x + 2 * x + 3 bf_col = scalars_df["int64_col"] - bf_result = bf_col.apply(foo).to_pandas() + + # Can't be applied to BigFrames Series 
without by_row=False + with pytest.raises(AssertionError, match="by_row=False"): + bf_col.apply(foo) + + bf_result = bf_col.apply(foo, by_row=False).to_pandas() pd_col = scalars_pandas_df["int64_col"] pd_result = pd_col.apply(foo) @@ -3177,4 +3192,4 @@ def test_apply_not_supported(scalars_dfs, col, lambda_, exception): bf_col = scalars_df[col] with pytest.raises(exception): - bf_col.apply(lambda_) + bf_col.apply(lambda_, by_row=False) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 2099d2be50..dc1b8014e9 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1116,6 +1116,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: def apply( self, func, + by_row="compat", ) -> DataFrame | Series: """ Invoke function on values of a Series. @@ -1181,7 +1182,8 @@ def apply( >>> names = bpd.Series(["Alice", "Bob"]) >>> hashes = names.apply(get_hash) - Simple functions, lambdas or ufuncs can be applied directly. + Simple vectorized functions, lambdas or ufuncs can be applied directly + with `by_row=False`. >>> nums = bpd.Series([1, 2, 3, 4]) >>> nums @@ -1190,7 +1192,7 @@ def apply( 2 3 3 4 dtype: Int64 - >>> nums.apply(lambda x: x*x + 2*x + 1) + >>> nums.apply(lambda x: x*x + 2*x + 1, by_row=False) 0 4 1 9 2 16 @@ -1199,14 +1201,14 @@ def apply( >>> def is_odd(num): ... return num % 2 == 1 - >>> nums.apply(is_odd) + >>> nums.apply(is_odd, by_row=False) 0 True 1 False 2 True 3 False dtype: boolean - >>> nums.apply(np.log) + >>> nums.apply(np.log, by_row=False) 0 0.0 1 0.693147 2 1.098612 @@ -1218,6 +1220,10 @@ def apply( BigFrames DataFrames ``remote_function`` to apply. The function should take a scalar and return a scalar. It will be applied to every element in the ``Series``. 
+ by_row (False or "compat", default "compat"): + If `"compat"`, func must be a remote function which will be + passed each element of the Series, like `Series.map`. If False, + the func will be passed the whole Series at once. Returns: bigframes.series.Series: A new Series with values representing the From 3b51709fc0ee4b3c8bd5a14721bfb3155b067e87 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 9 Feb 2024 23:08:20 +0000 Subject: [PATCH 09/10] raise ValueError instead of AssertionError --- bigframes/series.py | 7 ++++--- tests/system/small/test_series.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 012fd3fd5f..14dc1fc504 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1237,9 +1237,10 @@ def apply( # It is not a remote function # Then it must be a vectorized function that applies to the Series # as a whole - assert ( - not by_row - ), "A vectorized non-remote function can be provided only with by_row=False" + if by_row: + raise ValueError( + "A vectorized non-remote function can be provided only with by_row=False" + ) try: return func(self) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 25cbc33e31..ed44ea35a5 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3111,7 +3111,7 @@ def test_apply_lambda(scalars_dfs, col, lambda_): bf_col = scalars_df[col] # Can't be applied to BigFrames Series without by_row=False - with pytest.raises(AssertionError, match="by_row=False"): + with pytest.raises(ValueError, match="by_row=False"): bf_col.apply(lambda_) bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() @@ -3142,7 +3142,7 @@ def test_apply_numpy_ufunc(scalars_dfs, ufunc): bf_col = scalars_df["int64_col"] # Can't be applied to BigFrames Series without by_row=False - with pytest.raises(AssertionError, match="by_row=False"): + with pytest.raises(ValueError, match="by_row=False"): 
bf_col.apply(ufunc) bf_result = bf_col.apply(ufunc, by_row=False).to_pandas() @@ -3162,7 +3162,7 @@ def foo(x): bf_col = scalars_df["int64_col"] # Can't be applied to BigFrames Series without by_row=False - with pytest.raises(AssertionError, match="by_row=False"): + with pytest.raises(ValueError, match="by_row=False"): bf_col.apply(foo) bf_result = bf_col.apply(foo, by_row=False).to_pandas() From 33d8b8b7832f033ee15df70fae85f4c99f937a22 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Sat, 10 Feb 2024 00:22:08 +0000 Subject: [PATCH 10/10] fix Series.mask tests --- bigframes/series.py | 9 +++++++-- tests/system/small/test_series.py | 8 ++++---- third_party/bigframes_vendored/pandas/core/series.py | 4 ++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 14dc1fc504..4aef959a76 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1239,7 +1239,8 @@ def apply( # as a whole if by_row: raise ValueError( - "A vectorized non-remote function can be provided only with by_row=False" + "A vectorized non-remote function can be provided only with by_row=False." + " For element-wise operation it must be a remote function." 
) try: @@ -1361,7 +1362,11 @@ def duplicated(self, keep: str = "first") -> Series: def mask(self, cond, other=None) -> Series: if callable(cond): - cond = self.apply(cond) + if hasattr(cond, "bigframes_remote_function"): + cond = self.apply(cond) + else: + # For non-remote function assume that it is applicable on Series + cond = self.apply(cond, by_row=False) if not isinstance(cond, Series): raise TypeError( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index ed44ea35a5..42651ed96f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2580,10 +2580,10 @@ def test_mask_lambda(scalars_dfs, lambda_): scalars_df, scalars_pandas_df = scalars_dfs bf_col = scalars_df["int64_col"] - bf_result = bf_col.apply(lambda_).to_pandas() + bf_result = bf_col.mask(lambda_).to_pandas() pd_col = scalars_pandas_df["int64_col"] - pd_result = pd_col.apply(lambda_) + pd_result = pd_col.mask(lambda_) # ignore dtype check, which are Int64 and object respectively assert_series_equal(bf_result, pd_result, check_dtype=False) @@ -2596,10 +2596,10 @@ def foo(x): return x < 1000000 bf_col = scalars_df["int64_col"] - bf_result = bf_col.apply(foo).to_pandas() + bf_result = bf_col.mask(foo).to_pandas() pd_col = scalars_pandas_df["int64_col"] - pd_result = pd_col.apply(foo) + pd_result = pd_col.mask(foo) # ignore dtype check, which are Int64 and object respectively assert_series_equal(bf_result, pd_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index dc1b8014e9..b203471606 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2780,8 +2780,8 @@ def mask(self, cond, other): 2 Caroline dtype: string - Simple lambdas or python functions can be used as long as they only - perform operations supported on a Series. + Simple vectorized (i.e. 
they only perform operations supported on a + Series) lambdas or python functions can be used directly. >>> nums = bpd.Series([1, 2, 3, 4], name="nums") >>> nums