From b634f6c6d4869ee226c8070e654af585c1e8c71e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 3 Apr 2024 22:07:00 +0000 Subject: [PATCH 1/3] feat: read_pandas accepts pandas Series and Index objects --- .pre-commit-config.yaml | 2 +- bigframes/pandas/__init__.py | 16 +++++++++++ bigframes/series.py | 2 +- bigframes/session/__init__.py | 45 +++++++++++++++++++++++++++--- tests/system/small/test_session.py | 15 ++++++++++ 5 files changed, 74 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 517176da89..af05f4423c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: rev: v1.1.1 hooks: - id: mypy - additional_dependencies: [types-requests, types-tabulate] + additional_dependencies: [types-requests, types-tabulate, pandas-stubs] diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index fc008f36e5..5d775b475b 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -32,6 +32,7 @@ Literal, MutableSequence, Optional, + overload, Sequence, Tuple, Union, @@ -577,7 +578,22 @@ def read_gbq_table( read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table) +@overload def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame: + ... + + +@overload +def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + +@overload +def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index: + ... 
+ + +def read_pandas(pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index]): return global_session.with_default_session( bigframes.session.Session.read_pandas, pandas_dataframe, diff --git a/bigframes/series.py b/bigframes/series.py index e7b358c2fe..7e2b0408b7 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1514,7 +1514,7 @@ def map( map_df = map_df.rename(columns={arg.name: self.name}) elif isinstance(arg, Mapping): map_df = bigframes.dataframe.DataFrame( - {"keys": list(arg.keys()), self.name: list(arg.values())}, + {"keys": list(arg.keys()), self.name: list(arg.values())}, # type: ignore session=self._get_block().expr.session, ) map_df = map_df.set_index("keys") diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ac266da3bd..fbb9a66ba3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -34,6 +34,7 @@ Mapping, MutableSequence, Optional, + overload, Sequence, Tuple, Union, @@ -95,7 +96,9 @@ # Avoid circular imports. if typing.TYPE_CHECKING: + import bigframes.core.indexes import bigframes.dataframe as dataframe + import bigframes.series _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" @@ -963,7 +966,23 @@ def read_gbq_model(self, model_name: str): model = self.bqclient.get_model(model_ref) return bigframes.ml.loader.from_bq(self, model) + @overload + def read_pandas( + self, pandas_dataframe: pandas.Index + ) -> bigframes.core.indexes.Index: + ... + + @overload + def read_pandas(self, pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + @overload def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame: + ... + + def read_pandas( + self, pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index] + ): """Loads DataFrame from a pandas DataFrame. 
The pandas DataFrame will be persisted as a temporary BigQuery table, which can be @@ -986,13 +1005,31 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame [2 rows x 2 columns] Args: - pandas_dataframe (pandas.DataFrame): - a pandas DataFrame object to be loaded. + pandas_dataframe (pandas.DataFrame, pandas.Series, or pandas.Index): + a pandas DataFrame/Series/Index object to be loaded. Returns: - bigframes.dataframe.DataFrame: The BigQuery DataFrame. + An equivalent bigframes.pandas.(DataFrame/Series/Index) object """ - return self._read_pandas(pandas_dataframe, "read_pandas") + import bigframes.series as series + + # Try to handle non-dataframe pandas objects as well + if isinstance(pandas_dataframe, pandas.Series): + bf_df = self._read_pandas(pandas.DataFrame(pandas_dataframe), "read_pandas") + bf_series = typing.cast(series.Series, bf_df[bf_df.columns[0]]) + # wrapping into df can set name to 0 so reset to original object name + bf_series.name = pandas_dataframe.name + return bf_series + if isinstance(pandas_dataframe, pandas.Index): + return self._read_pandas( + pandas.DataFrame(index=pandas_dataframe), "read_pandas" + ).index + if isinstance(pandas_dataframe, pandas.DataFrame): + return self._read_pandas(pandas_dataframe, "read_pandas") + else: + raise ValueError( + f"read_pandas() expects a pandas dataframe, but got a {type(pandas_dataframe)}" + ) def _read_pandas( self, pandas_dataframe: pandas.DataFrame, api_name: str diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 28a3f03860..eb6a0a8dd9 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -421,6 +421,21 @@ def test_read_pandas(session, scalars_dfs): pd.testing.assert_frame_equal(result, expected) +def test_read_pandas_series(session): + idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) + bf_series = 
session.read_pandas(pd_series) + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_read_pandas_index(session): + pd_idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + bf_idx = session.read_pandas(pd_idx) + + pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + def test_read_pandas_inline_respects_location(): options = bigframes.BigQueryOptions(location="europe-west1") session = bigframes.Session(options) From 0c2ed38360e310733f86027257534a7a3315080b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 3 Apr 2024 22:27:02 +0000 Subject: [PATCH 2/3] fix unit tests --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index fbb9a66ba3..9c1517632d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1028,7 +1028,7 @@ def read_pandas( return self._read_pandas(pandas_dataframe, "read_pandas") else: raise ValueError( - f"read_pandas() expects a pandas dataframe, but got a {type(pandas_dataframe)}" + f"read_pandas() expects a pandas.DataFrame, but got a {type(pandas_dataframe)}" ) def _read_pandas( From cd2db55944b66fb4e873dbe3c92d2de8c7898eaa Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 4 Apr 2024 00:09:16 +0000 Subject: [PATCH 3/3] fully qualify typing.overload --- bigframes/pandas/__init__.py | 7 +++---- bigframes/session/__init__.py | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5d775b475b..4b0ac4310c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -32,7 +32,6 @@ Literal, MutableSequence, Optional, - overload, Sequence, Tuple, Union, @@ -578,17 +577,17 @@ def read_gbq_table( read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table) -@overload +@typing.overload def read_pandas(pandas_dataframe: pandas.DataFrame) -> 
bigframes.dataframe.DataFrame: ... -@overload +@typing.overload def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series: ... -@overload +@typing.overload def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index: ... diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9c1517632d..c7605e89d7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -34,7 +34,6 @@ Mapping, MutableSequence, Optional, - overload, Sequence, Tuple, Union, @@ -966,17 +965,17 @@ def read_gbq_model(self, model_name: str): model = self.bqclient.get_model(model_ref) return bigframes.ml.loader.from_bq(self, model) - @overload + @typing.overload def read_pandas( self, pandas_dataframe: pandas.Index ) -> bigframes.core.indexes.Index: ... - @overload + @typing.overload def read_pandas(self, pandas_dataframe: pandas.Series) -> bigframes.series.Series: ... - @overload + @typing.overload def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame: ...