From 1e0c39e06b14b9565746a8c138c3e051dd748a6b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 28 Mar 2024 22:36:58 +0000 Subject: [PATCH 1/7] feat: Allow DataFrame binary ops to align on either axis and with local objects. --- bigframes/core/normalize.py | 49 +++++++++++++++ bigframes/dataframe.py | 92 +++++++++++++++++++++------- bigframes/typing.py | 0 tests/system/small/test_dataframe.py | 31 ++++++++++ 4 files changed, 149 insertions(+), 23 deletions(-) create mode 100644 bigframes/core/normalize.py create mode 100644 bigframes/typing.py diff --git a/bigframes/core/normalize.py b/bigframes/core/normalize.py new file mode 100644 index 0000000000..f4a68364ba --- /dev/null +++ b/bigframes/core/normalize.py @@ -0,0 +1,49 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import pandas as pd + +import bigframes.core.indexes.index as index +import bigframes.series as series + + +def normalize_to_bf_series(obj, default_index: index.Index) -> series.Series: + if isinstance(obj, series.Series): + return obj + if isinstance(obj, pd.Series): + return series.Series(obj) + if isinstance(obj, index.Index): + return series.Series(obj, default_index) + if isinstance(obj, pd.Index): + return series.Series(obj, default_index) + if pd.api.types.is_list_like(obj): + return series.Series(obj, default_index) + else: + raise TypeError(f"Cannot interpret {obj} as series.") + + +def normalize_to_pd_series(obj, default_index: pd.Index) -> pd.Series: + if isinstance(obj, series.Series): + return obj.to_pandas() + if isinstance(obj, pd.Series): + return obj + if isinstance(obj, index.Index): + return pd.Series(obj.to_pandas(), default_index) + if isinstance(obj, pd.Index): + return pd.Series(obj, default_index) + if pd.api.types.is_list_like(obj): + return pd.Series(obj, default_index) + else: + raise TypeError(f"Cannot interpret {obj} as series.") diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 599546284b..5f78bd5b0c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -55,6 +55,7 @@ import bigframes.core.guid import bigframes.core.indexers as indexers import bigframes.core.indexes as indexes +import bigframes.core.normalize import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window @@ -663,22 +664,20 @@ def _apply_binop( how: str = "outer", reverse: bool = False, ): - if isinstance(other, (float, int)): + if isinstance(other, (float, int, bool)): return self._apply_scalar_binop(other, op, reverse=reverse) - elif isinstance(other, indexes.Index): - return self._apply_series_binop( - other.to_series(index=self.index), - op, - axis=axis, - how=how, - reverse=reverse, - ) - elif isinstance(other, bigframes.series.Series): - return 
self._apply_series_binop( - other, op, axis=axis, how=how, reverse=reverse - ) elif isinstance(other, DataFrame): return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) + elif isinstance(other, pandas.DataFrame): + return self._apply_dataframe_binop( + DataFrame(other), op, how=how, reverse=reverse + ) + elif utils.get_axis_number(axis) == 0: + input = bigframes.core.normalize.normalize_to_bf_series(other, self.index) + return self._apply_series_binop_axis_0(input, op, how, reverse) + elif utils.get_axis_number(axis) == 1: + input = bigframes.core.normalize.normalize_to_pd_series(other, self.columns) + return self._apply_series_binop_axis_1(input, op, how, reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." f"{constants.FEEDBACK_LINK}" @@ -700,22 +699,13 @@ def _apply_scalar_binop( block = block.drop_columns([column_id]) return DataFrame(block) - def _apply_series_binop( + def _apply_series_binop_axis_0( self, other: bigframes.series.Series, op: ops.BinaryOp, - axis: str | int = "columns", how: str = "outer", reverse: bool = False, ) -> DataFrame: - if axis not in ("columns", "index", 0, 1): - raise ValueError(f"Invalid input: axis {axis}.") - - if axis in ("columns", 1): - raise NotImplementedError( - f"Row Series operations haven't been supported. 
{constants.FEEDBACK_LINK}" - ) - block, (get_column_left, get_column_right) = self._block.join( other._block, how=how ) @@ -738,6 +728,62 @@ def _apply_series_binop( block = block.with_index_labels(self.index.names) return DataFrame(block) + def _apply_series_binop_axis_1( + self, + other: pandas.Series, + op: ops.BinaryOp, + how: str = "outer", + reverse: bool = False, + ) -> DataFrame: + # join columns schema + # indexers will be none for exact match + if self.columns.equals(other.index): + columns, lcol_indexer, rcol_indexer = self.columns, None, None + else: + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.index, how=how, return_indexers=True + ) + + binop_result_ids = [] + + column_indices = zip( + lcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (rcol_indexer is not None) else range(len(columns)), + ) + + block = self._block + for left_index, right_index in column_indices: + if left_index >= 0 and right_index >= 0: # -1 indices indicate missing + self_col_id = self._block.value_columns[left_index] + other_scalar = other.iloc[right_index] + expr = ( + op.as_expr(ex.const(other_scalar), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(other_scalar)) + ) + elif left_index >= 0: + self_col_id = self._block.value_columns[left_index] + expr = ( + op.as_expr(ex.const(None), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(None)) + ) + elif right_index >= 0: + other_scalar = other.iloc[right_index] + expr = ( + op.as_expr(ex.const(other_scalar), ex.const(None)) + if reverse + else op.as_expr(ex.const(None), ex.const(other_scalar)) + ) + else: + # Should not be possible + raise ValueError("No right or left index.") + block, result_col_id = block.project_expr(expr) + binop_result_ids.append(result_col_id) + + block = block.select_columns(binop_result_ids) + return DataFrame(block.with_column_labels(columns)) + def _apply_dataframe_binop( self, other: DataFrame, diff --git 
a/bigframes/typing.py b/bigframes/typing.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 355849538e..07a2d208ac 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -27,6 +27,7 @@ import bigframes import bigframes._config.display_options as display_options +import bigframes.core.indexes.index as bf_indexes import bigframes.dataframe as dataframe import bigframes.series as series from tests.system.utils import ( @@ -2056,6 +2057,36 @@ def test_series_binop_axis_index( assert_pandas_df_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("input"), + [ + ((1000, 2000, 3000)), + (pd.Index([1000, 2000, 3000])), + (bf_indexes.Index([1000, 2000, 3000])), + (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), + (series.Series((1000, 2000), index=["int64_too", "float64_col"])), + ], + ids=[ + "tuple", + "pd_index", + "bf_index", + "pd_series", + "bf_series", + ], +) +def test_listlike_binop_axis_1(scalars_dfs, input): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() + if hasattr(input, "to_pandas"): + input = input.to_pandas() + pd_result = scalars_pandas_df[df_columns].add(input, axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("left_labels", "right_labels"), [ From 78369f770a78e3945e2b1e96814eb5edb01bffa3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 1 Apr 2024 19:22:39 +0000 Subject: [PATCH 2/7] fix mypy issue --- bigframes/dataframe.py | 12 ++++++++---- tests/system/small/test_dataframe.py | 8 ++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5f78bd5b0c..bc9c3ad92f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -673,11 +673,15 
@@ def _apply_binop( DataFrame(other), op, how=how, reverse=reverse ) elif utils.get_axis_number(axis) == 0: - input = bigframes.core.normalize.normalize_to_bf_series(other, self.index) - return self._apply_series_binop_axis_0(input, op, how, reverse) + bf_series = bigframes.core.normalize.normalize_to_bf_series( + other, self.index + ) + return self._apply_series_binop_axis_0(bf_series, op, how, reverse) elif utils.get_axis_number(axis) == 1: - input = bigframes.core.normalize.normalize_to_pd_series(other, self.columns) - return self._apply_series_binop_axis_1(input, op, how, reverse) + pd_series = bigframes.core.normalize.normalize_to_pd_series( + other, self.columns + ) + return self._apply_series_binop_axis_1(pd_series, op, how, reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." f"{constants.FEEDBACK_LINK}" diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 07a2d208ac..261fc223d9 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -45,6 +45,14 @@ def test_df_construct_copy(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_large_inline(): + import uuid + + df = pd.DataFrame([str(uuid.uuid4()) * 500 for _ in range(4000)]) + + print(dataframe.DataFrame(df)) + + def test_df_construct_pandas_default(scalars_dfs): # This should trigger the inlined codepath columns = [ From 3d74bef43a04c58d2955665719bdd226dd7b788c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 1 Apr 2024 20:24:47 +0000 Subject: [PATCH 3/7] remove unwanted inlining test --- tests/system/small/test_dataframe.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0910cc4e7c..fc1ce6e0bc 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -45,14 +45,6 @@ def 
test_df_construct_copy(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_df_large_inline(): - import uuid - - df = pd.DataFrame([str(uuid.uuid4()) * 500 for _ in range(4000)]) - - print(dataframe.DataFrame(df)) - - def test_df_construct_pandas_default(scalars_dfs): # This should trigger the inlined codepath columns = [ From ed4eef0844d822feadb1d2678f9ba5f131bc8b30 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 1 Apr 2024 23:10:25 +0000 Subject: [PATCH 4/7] only run new tests with newer pandas versions --- tests/system/small/test_dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index fc1ce6e0bc..9ba0aaf280 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2075,6 +2075,7 @@ def test_series_binop_axis_index( assert_pandas_df_equal(bf_result, pd_result) +@skip_legacy_pandas @pytest.mark.parametrize( ("input"), [ From f9e99a28da56f68f51d709a34fc1ae94dd626748 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 4 Apr 2024 00:41:19 +0000 Subject: [PATCH 5/7] add index uniqueness check --- bigframes/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b5ed65f5ed..6a472c2100 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -739,11 +739,12 @@ def _apply_series_binop_axis_1( how: str = "outer", reverse: bool = False, ) -> DataFrame: - # join columns schema - # indexers will be none for exact match + # Somewhat different alignment than df-df so separate codepath for now. 
if self.columns.equals(other.index): columns, lcol_indexer, rcol_indexer = self.columns, None, None else: + if not (self.columns.is_unique and other.index.is_unique): + raise ValueError("Cannot align non-unique indices") columns, lcol_indexer, rcol_indexer = self.columns.join( other.index, how=how, return_indexers=True ) From 420ea03caf4b2c9e95a614445ea2314adebe7ab6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 4 Apr 2024 00:53:22 +0000 Subject: [PATCH 6/7] fix index imports --- bigframes/core/normalize.py | 2 +- tests/system/small/test_dataframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/normalize.py b/bigframes/core/normalize.py index f4a68364ba..06385d67f3 100644 --- a/bigframes/core/normalize.py +++ b/bigframes/core/normalize.py @@ -15,7 +15,7 @@ import pandas as pd -import bigframes.core.indexes.index as index +import bigframes.core.indexes as index import bigframes.series as series diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9ba0aaf280..ae80a088b5 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -27,7 +27,7 @@ import bigframes import bigframes._config.display_options as display_options -import bigframes.core.indexes.index as bf_indexes +import bigframes.core.indexes as bf_indexes import bigframes.dataframe as dataframe import bigframes.series as series from tests.system.utils import ( From 8128b84164d5335f83f114341211cb1baceaa989 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 4 Apr 2024 01:17:06 +0000 Subject: [PATCH 7/7] rename normalize to convert --- bigframes/core/{normalize.py => convert.py} | 4 ++-- bigframes/dataframe.py | 10 +++------- 2 files changed, 5 insertions(+), 9 deletions(-) rename bigframes/core/{normalize.py => convert.py} (91%) diff --git a/bigframes/core/normalize.py b/bigframes/core/convert.py similarity index 91% rename from bigframes/core/normalize.py rename to 
bigframes/core/convert.py index 06385d67f3..98f854ad72 100644 --- a/bigframes/core/normalize.py +++ b/bigframes/core/convert.py @@ -19,7 +19,7 @@ import bigframes.series as series -def normalize_to_bf_series(obj, default_index: index.Index) -> series.Series: +def to_bf_series(obj, default_index: index.Index) -> series.Series: if isinstance(obj, series.Series): return obj if isinstance(obj, pd.Series): @@ -34,7 +34,7 @@ def normalize_to_bf_series(obj, default_index: index.Index) -> series.Series: raise TypeError(f"Cannot interpret {obj} as series.") -def normalize_to_pd_series(obj, default_index: pd.Index) -> pd.Series: +def to_pd_series(obj, default_index: pd.Index) -> pd.Series: if isinstance(obj, series.Series): return obj.to_pandas() if isinstance(obj, pd.Series): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6a472c2100..97a100474a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -50,12 +50,12 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.convert import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.guid import bigframes.core.indexers as indexers import bigframes.core.indexes as indexes -import bigframes.core.normalize import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window @@ -673,14 +673,10 @@ def _apply_binop( DataFrame(other), op, how=how, reverse=reverse ) elif utils.get_axis_number(axis) == 0: - bf_series = bigframes.core.normalize.normalize_to_bf_series( - other, self.index - ) + bf_series = bigframes.core.convert.to_bf_series(other, self.index) return self._apply_series_binop_axis_0(bf_series, op, how, reverse) elif utils.get_axis_number(axis) == 1: - pd_series = bigframes.core.normalize.normalize_to_pd_series( - other, self.columns - ) + pd_series = bigframes.core.convert.to_pd_series(other, self.columns) 
return self._apply_series_binop_axis_1(pd_series, op, how, reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}."