From 664fb410e3a2d02172ac428d631a49e4d62c0332 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Thu, 20 Mar 2025 17:49:23 +0000
Subject: [PATCH 1/3] fix: Local data always has sequential index

---
 bigframes/core/indexes/base.py    | 11 ++---
 bigframes/dataframe.py            |  2 +-
 bigframes/operations/base.py      | 74 ++++++++++++-------------------
 tests/system/small/test_series.py |  9 +++-
 4 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index 900825996e..71dc914ed4 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -70,9 +70,7 @@ def __new__(
         elif isinstance(data, series.Series) or isinstance(data, Index):
             if isinstance(data, series.Series):
                 block = data._block
-                block = block.set_index(
-                    col_ids=[data._value_column],
-                )
+                block = block.set_index(col_ids=[data._value_column])
             elif isinstance(data, Index):
                 block = data._block
             index = Index(data=block)
@@ -493,7 +491,7 @@ def __getitem__(self, key: int) -> typing.Any:
         raise NotImplementedError(f"Index key not supported {key}")
 
     @overload
-    def to_pandas(
+    def to_pandas(  # type: ignore[overload-overlap]
         self,
         *,
         allow_large_results: Optional[bool] = ...,
@@ -508,7 +506,10 @@
         ...
 
     def to_pandas(
-        self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False
+        self,
+        *,
+        allow_large_results: Optional[bool] = None,
+        dry_run: bool = False,
     ) -> pandas.Index | pandas.Series:
         """Gets the Index as a pandas Index.
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 38f663f56f..4955bb1295 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1596,7 +1596,7 @@ def to_arrow(
         return pa_table
 
     @overload
-    def to_pandas(
+    def to_pandas(  # type: ignore[overload-overlap]
         self,
         max_download_size: Optional[int] = ...,
         sampling_method: Optional[str] = ...,
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index 75db2f48e9..46ca47450f 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -69,40 +69,12 @@ def __init__(
             raise ValueError(
                 f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}"
             )
+
         if isinstance(data, blocks.Block):
-            # Constructing from block is for internal use only - shouldn't use parameters, block encompasses all state
-            assert len(data.value_columns) == 1
-            assert len(data.column_labels) == 1
-            assert index is None
-            assert name is None
-            assert dtype is None
             block = data
-
-        # interpret these cases as both index and data
-        elif isinstance(data, bigframes.pandas.Series) or pd.api.types.is_dict_like(
-            data
-        ):  # includes pd.Series
-            if isinstance(data, bigframes.pandas.Series):
-                data = data.copy()
-                if name is not None:
-                    data.name = name
-                if dtype is not None:
-                    bf_dtype = bigframes.dtypes.bigframes_type(dtype)
-                    data = data.astype(bf_dtype)
-            else:  # local dict-like data
-                data = read_pandas_func(pd.Series(data, name=name, dtype=dtype))  # type: ignore
-            data_block = data._block
-            if index is not None:
-                # reindex
-                bf_index = indexes.Index(index, session=session)
-                idx_block = bf_index._block
-                idx_cols = idx_block.value_columns
-                block_idx, _ = idx_block.join(data_block, how="left")
-                data_block = block_idx.with_index_labels(bf_index.names)
-            block = data_block
-
-        # list-like data that will get default index
-        elif isinstance(data, indexes.Index) or pd.api.types.is_list_like(data):
+        elif isinstance(data, SeriesMethods):
+            block = data._get_block()
+        elif isinstance(data, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
             if data.nlevels != 1:
@@ -111,8 +83,7 @@ def __init__(
             data_block = data._block.reset_index(drop=False).with_column_labels(
                 data.names
             )
-            if index is not None:
-                # Align by offset
+            if index is not None:  # Align data and index by offset
                 bf_index = indexes.Index(index, session=session)
                 idx_block = bf_index._block.reset_index(
                     drop=False
@@ -121,19 +92,32 @@ def __init__(
                 data_block, (l_mapping, _) = idx_block.join(data_block, how="left")
                 data_block = data_block.set_index([l_mapping[col] for col in idx_cols])
                 data_block = data_block.with_index_labels(bf_index.names)
+                # prevents no-op reindex later
+                index = None
             block = data_block
 
-        else:  # Scalar case
-            if index is not None:
-                bf_index = indexes.Index(index, session=session)
-            else:
-                bf_index = indexes.Index(
-                    [] if (data is None) else [0],
-                    session=session,
-                    dtype=bigframes.dtypes.INT_DTYPE,
-                )
-            block, _ = bf_index._block.create_constant(data, dtype)
-            block = block.with_column_labels([name])
+        if block:
+            assert len(block.value_columns) == 1
+            assert len(block.column_labels) == 1
+            if index is not None:  # reindexing operation
+                bf_index = indexes.Index(index)
+                idx_block = bf_index._block
+                idx_cols = idx_block.index_columns
+                block, _ = idx_block.join(block, how="left")
+                block = block.with_index_labels(bf_index.names)
+            if name:
+                block = block.with_column_labels([name])
+            if dtype:
+                bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+                block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
+        else:
+            pd_series = pd.Series(
+                data=data,
+                index=index,  # type:ignore
+                dtype=dtype,  # type:ignore
+                name=name,
+            )
+            block = read_pandas_func(pd_series)._get_block()  # type:ignore
 
         assert block is not None
         self._block: blocks.Block = block
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index acd267aaf8..9bce47c20a 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -140,7 +140,6 @@ def test_series_construct_reindex():
     # BigQuery DataFrame default indices use nullable Int64 always
     pd_result.index = pd_result.index.astype("Int64")
 
-
    pd.testing.assert_series_equal(bf_result, pd_result)
 
 
@@ -303,6 +302,14 @@ def test_series_construct_w_dtype_for_array_struct():
     )
 
 
+def test_series_construct_local_unordered_has_sequential_index(unordered_session):
+    series = bigframes.pandas.Series(
+        ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session
+    )
+    expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype())
+    pd.testing.assert_index_equal(series.index.to_pandas(), expected)
+
+
 def test_series_construct_w_dtype_for_json():
     # Until b/401630655 is resolved, json, not compatible with allow_large_results=False
     with bigframes.option_context("bigquery.allow_large_results", True):

From 8ad4aa73451a47c9b6ef587d763106950dd12489 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Thu, 20 Mar 2025 20:00:25 +0000
Subject: [PATCH 2/3] handle remote index local data case

---
 bigframes/operations/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index 46ca47450f..5307595a7b 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -74,7 +74,7 @@ def __init__(
             block = data
         elif isinstance(data, SeriesMethods):
             block = data._get_block()
-        elif isinstance(data, indexes.Index):
+        elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
             if data.nlevels != 1:

From 8b614ac92d7a2f976430e9296727e573b61c05ae Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 26 Mar 2025 17:59:12 +0000
Subject: [PATCH 3/3] add data scalar bigframes.Index index case

---
 bigframes/operations/base.py      | 10 ++++++++++
 tests/system/small/test_series.py | 11 +++++++++++
 2 files changed, 21 insertions(+)

diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index 5307595a7b..8d70596b7d 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -27,6 +27,7 @@
 import bigframes.core.identifiers as ids
 import bigframes.core.indexes as indexes
 import bigframes.core.scalar as scalars
+import bigframes.core.utils as bf_utils
 import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -74,6 +75,15 @@ def __init__(
             block = data
         elif isinstance(data, SeriesMethods):
             block = data._get_block()
+        # special case where data is local scalar, but index is bigframes index (maybe very big)
+        elif (
+            not bf_utils.is_list_like(data) and not isinstance(data, indexes.Index)
+        ) and isinstance(index, indexes.Index):
+            block = index._block
+            block, _ = block.create_constant(data)
+            block = block.with_column_labels([None])
+            # prevents no-op reindex later
+            index = None
         elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 9bce47c20a..5ca055dc43 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -200,6 +200,17 @@ def test_series_construct_nan():
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
+def test_series_construct_scalar_w_bf_index():
+    bf_result = series.Series(
+        "hello", index=bigframes.pandas.Index([1, 2, 3])
+    ).to_pandas()
+    pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64"))
+
+    pd_result = pd_result.astype("string[pyarrow]")
+
+    pd.testing.assert_series_equal(bf_result, pd_result)
+
+
 def test_series_construct_from_list_escaped_strings():
     """Check that special characters are supported."""
     strings = [