From 664fb410e3a2d02172ac428d631a49e4d62c0332 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Thu, 20 Mar 2025 17:49:23 +0000
Subject: [PATCH 1/3] fix: Local data always has sequential index

---
 bigframes/core/indexes/base.py    | 11 ++---
 bigframes/dataframe.py            |  2 +-
 bigframes/operations/base.py      | 74 ++++++++++++-------------------
 tests/system/small/test_series.py |  9 +++-
 4 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index 900825996e..71dc914ed4 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -70,9 +70,7 @@ def __new__(
         elif isinstance(data, series.Series) or isinstance(data, Index):
             if isinstance(data, series.Series):
                 block = data._block
-                block = block.set_index(
-                    col_ids=[data._value_column],
-                )
+                block = block.set_index(col_ids=[data._value_column])
             elif isinstance(data, Index):
                 block = data._block
             index = Index(data=block)
@@ -493,7 +491,7 @@ def __getitem__(self, key: int) -> typing.Any:
         raise NotImplementedError(f"Index key not supported {key}")
 
     @overload
-    def to_pandas(
+    def to_pandas(  # type: ignore[overload-overlap]
         self,
         *,
         allow_large_results: Optional[bool] = ...,
@@ -508,7 +506,10 @@
         ...
 
     def to_pandas(
-        self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False
+        self,
+        *,
+        allow_large_results: Optional[bool] = None,
+        dry_run: bool = False,
     ) -> pandas.Index | pandas.Series:
         """Gets the Index as a pandas Index.
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 38f663f56f..4955bb1295 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1596,7 +1596,7 @@ def to_arrow(
         return pa_table
 
     @overload
-    def to_pandas(
+    def to_pandas(  # type: ignore[overload-overlap]
         self,
         max_download_size: Optional[int] = ...,
         sampling_method: Optional[str] = ...,
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index 75db2f48e9..46ca47450f 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -69,40 +69,12 @@ def __init__(
             raise ValueError(
                 f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}"
             )
+
         if isinstance(data, blocks.Block):
-            # Constructing from block is for internal use only - shouldn't use parameters, block encompasses all state
-            assert len(data.value_columns) == 1
-            assert len(data.column_labels) == 1
-            assert index is None
-            assert name is None
-            assert dtype is None
             block = data
-
-        # interpret these cases as both index and data
-        elif isinstance(data, bigframes.pandas.Series) or pd.api.types.is_dict_like(
-            data
-        ):  # includes pd.Series
-            if isinstance(data, bigframes.pandas.Series):
-                data = data.copy()
-                if name is not None:
-                    data.name = name
-                if dtype is not None:
-                    bf_dtype = bigframes.dtypes.bigframes_type(dtype)
-                    data = data.astype(bf_dtype)
-            else:  # local dict-like data
-                data = read_pandas_func(pd.Series(data, name=name, dtype=dtype))  # type: ignore
-            data_block = data._block
-            if index is not None:
-                # reindex
-                bf_index = indexes.Index(index, session=session)
-                idx_block = bf_index._block
-                idx_cols = idx_block.value_columns
-                block_idx, _ = idx_block.join(data_block, how="left")
-                data_block = block_idx.with_index_labels(bf_index.names)
-            block = data_block
-
-        # list-like data that will get default index
-        elif isinstance(data, indexes.Index) or pd.api.types.is_list_like(data):
+        elif isinstance(data, SeriesMethods):
+            block = data._get_block()
+        elif isinstance(data, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
             if data.nlevels != 1:
@@ -111,8 +83,7 @@ def __init__(
             data_block = data._block.reset_index(drop=False).with_column_labels(
                 data.names
             )
-            if index is not None:
-                # Align by offset
+            if index is not None:  # Align data and index by offset
                 bf_index = indexes.Index(index, session=session)
                 idx_block = bf_index._block.reset_index(
                     drop=False
@@ -121,19 +92,32 @@ def __init__(
                 data_block, (l_mapping, _) = idx_block.join(data_block, how="left")
                 data_block = data_block.set_index([l_mapping[col] for col in idx_cols])
                 data_block = data_block.with_index_labels(bf_index.names)
+                # prevents no-op reindex later
+                index = None
             block = data_block
 
-        else:  # Scalar case
-            if index is not None:
-                bf_index = indexes.Index(index, session=session)
-            else:
-                bf_index = indexes.Index(
-                    [] if (data is None) else [0],
-                    session=session,
-                    dtype=bigframes.dtypes.INT_DTYPE,
-                )
-            block, _ = bf_index._block.create_constant(data, dtype)
-            block = block.with_column_labels([name])
+        if block:
+            assert len(block.value_columns) == 1
+            assert len(block.column_labels) == 1
+            if index is not None:  # reindexing operation
+                bf_index = indexes.Index(index)
+                idx_block = bf_index._block
+                idx_cols = idx_block.index_columns
+                block, _ = idx_block.join(block, how="left")
+                block = block.with_index_labels(bf_index.names)
+            if name:
+                block = block.with_column_labels([name])
+            if dtype:
+                bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+                block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
+        else:
+            pd_series = pd.Series(
+                data=data,
+                index=index,  # type:ignore
+                dtype=dtype,  # type:ignore
+                name=name,
+            )
+            block = read_pandas_func(pd_series)._get_block()  # type:ignore
 
         assert block is not None
         self._block: blocks.Block = block
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index acd267aaf8..9bce47c20a 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -140,7 +140,6 @@ def test_series_construct_reindex():
     # BigQuery DataFrame default indices use nullable Int64 always
     pd_result.index = pd_result.index.astype("Int64")
 
-
    pd.testing.assert_series_equal(bf_result, pd_result)
 
 
@@ -303,6 +302,14 @@ def test_series_construct_w_dtype_for_array_struct():
     )
 
 
+def test_series_construct_local_unordered_has_sequential_index(unordered_session):
+    series = bigframes.pandas.Series(
+        ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session
+    )
+    expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype())
+    pd.testing.assert_index_equal(series.index.to_pandas(), expected)
+
+
 def test_series_construct_w_dtype_for_json():
     # Until b/401630655 is resolved, json, not compatible with allow_large_results=False
     with bigframes.option_context("bigquery.allow_large_results", True):

From 8ad4aa73451a47c9b6ef587d763106950dd12489 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Thu, 20 Mar 2025 20:00:25 +0000
Subject: [PATCH 2/3] handle remote index local data case

---
 bigframes/operations/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index 46ca47450f..5307595a7b 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -74,7 +74,7 @@ def __init__(
             block = data
         elif isinstance(data, SeriesMethods):
             block = data._get_block()
-        elif isinstance(data, indexes.Index):
+        elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
             if data.nlevels != 1:

From 8b614ac92d7a2f976430e9296727e573b61c05ae Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 26 Mar 2025 17:59:12 +0000
Subject: [PATCH 3/3] add data scalar bigframes.Index index case

---
 bigframes/operations/base.py      | 10 ++++++++++
 tests/system/small/test_series.py | 11 +++++++++++
 2 files changed, 21 insertions(+)

diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index 5307595a7b..8d70596b7d 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -27,6 +27,7 @@
 import bigframes.core.identifiers as ids
 import bigframes.core.indexes as indexes
 import bigframes.core.scalar as scalars
+import bigframes.core.utils as bf_utils
 import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -74,6 +75,15 @@ def __init__(
             block = data
         elif isinstance(data, SeriesMethods):
             block = data._get_block()
+        # special case where data is local scalar, but index is bigframes index (maybe very big)
+        elif (
+            not bf_utils.is_list_like(data) and not isinstance(data, indexes.Index)
+        ) and isinstance(index, indexes.Index):
+            block = index._block
+            block, _ = block.create_constant(data)
+            block = block.with_column_labels([None])
+            # prevents no-op reindex later
+            index = None
         elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 9bce47c20a..5ca055dc43 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -200,6 +200,17 @@ def test_series_construct_nan():
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
+def test_series_construct_scalar_w_bf_index():
+    bf_result = series.Series(
+        "hello", index=bigframes.pandas.Index([1, 2, 3])
+    ).to_pandas()
+    pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64"))
+
+    pd_result = pd_result.astype("string[pyarrow]")
+
+    pd.testing.assert_series_equal(bf_result, pd_result)
+
+
 def test_series_construct_from_list_escaped_strings():
     """Check that special characters are supported."""
     strings = [