Merged
Commits
39 commits
b303999
docs: document index as a best practice
tswast Apr 19, 2024
0ddd86b
docs: set `index_cols` in `read_gbq` as a best practice
tswast Apr 19, 2024
994a8f1
feat: support primary key(s) in `read_gbq` by using as the `index_col…
tswast Apr 19, 2024
5fcc5a0
revert WIP commit
tswast Apr 19, 2024
6b6a5ab
Merge branch 'main' into b335727141-primary_key
tswast Apr 22, 2024
8c4e31c
address type error in tests
tswast Apr 22, 2024
dd940bd
Merge branch 'b335727141-primary_key' into b335727141-clustered-or-pa…
tswast Apr 22, 2024
b96cba3
document behaviors
tswast Apr 22, 2024
fb3b508
Merge branch 'b335727141-docs' into b335727141-clustered-or-partition…
tswast Apr 22, 2024
477a516
update docs to reflect new default index behavior
tswast Apr 23, 2024
2c5a0dd
add DefaultIndexKind to allowed `index_col` values
tswast Apr 24, 2024
d485be6
Merge remote-tracking branch 'origin/main' into b335727141-clustered-…
tswast Apr 24, 2024
d816db3
refactor: cache table metadata alongside snapshot time
tswast Apr 24, 2024
d3f0891
Merge branch 'b335727141-snapshot-save-metadata' into b335727141-clus…
tswast Apr 24, 2024
241dc60
add unit tests
tswast Apr 25, 2024
613e660
parametrize tables with clustered and partitioned
tswast Apr 25, 2024
2c782ca
Merge remote-tracking branch 'origin/main' into b335727141-clustered-…
tswast Apr 26, 2024
f437dcf
refactor: split `read_gbq_table` implementation into functions and mo…
tswast Apr 26, 2024
0090dc0
refactor progress
tswast Apr 29, 2024
850db7a
add index_cols function
tswast Apr 29, 2024
ab98d4a
maybe ready for review
tswast Apr 29, 2024
5b665dd
Merge remote-tracking branch 'origin/main' into b335727141-refactor-r…
tswast Apr 29, 2024
0577131
Update bigframes/session/__init__.py
tswast Apr 30, 2024
f3f6982
Merge branch 'main' into b335727141-refactor-read_gbq_table
tswast Apr 30, 2024
453eece
Merge branch 'b335727141-refactor-read_gbq_table' into b335727141-clu…
tswast Apr 30, 2024
175a23c
Merge remote-tracking branch 'origin/main' into b335727141-clustered-…
tswast Apr 30, 2024
204b2db
remove some todos
tswast Apr 30, 2024
adaf664
add error raising plus todos
tswast Apr 30, 2024
e8bdded
Merge remote-tracking branch 'origin/main' into b335727141-clustered-…
tswast May 1, 2024
d028bc5
add TODO for ROW_NUMBER() in the query we generate
tswast May 1, 2024
658f61d
remove filters unit test for now
tswast May 1, 2024
f1b3f88
docstring fixes
tswast May 1, 2024
6b0e63c
Merge branch 'main' into b335727141-clustered-or-partitioned-default-…
tswast May 1, 2024
40fab82
Merge remote-tracking branch 'origin/main' into b335727141-clustered-…
tswast May 2, 2024
9f3e149
feat: support `index_col=False` in `read_csv` and `engine="bigquery"`
tswast May 2, 2024
722abbb
Merge remote-tracking branch 'origin/b335727141-clustered-or-partitio…
tswast May 2, 2024
e7c4d93
revert typo
tswast May 2, 2024
d136bc0
attempt 2
tswast May 2, 2024
586cca2
Merge remote-tracking branch 'origin/main' into b335727141-clustered-…
tswast May 2, 2024
4 changes: 4 additions & 0 deletions bigframes/__init__.py
@@ -17,6 +2,8 @@
from bigframes._config import option_context, options
from bigframes._config.bigquery_options import BigQueryOptions
from bigframes.core.global_session import close_session, get_global_session
import bigframes.enums as enums
import bigframes.exceptions as exceptions
from bigframes.session import connect, Session
from bigframes.version import __version__

@@ -25,6 +27,8 @@
"BigQueryOptions",
"get_global_session",
"close_session",
"enums",
Contributor commented:

Is `enums` an intuitive module name, or would a domain-related term be better, e.g. `indexing.IndexType`, or directly putting the enum in the main module as `bigframes.pandas.IndexType`?

Collaborator (author) replied:

I tried to find some guidance on this, but the Python community doesn't seem particularly prescriptive about module names.

PEP-8 has this to say:

Modules should have short, all-lowercase names. Underscores can be used in the module name if it improves readability.

https://peps.python.org/pep-0008/#package-and-module-names

Google Python style guide has a bit more to say:

Place related classes and top-level functions together in a module. Unlike Java, there is no need to limit yourself to one class per module.

Use CapWords for class names, but lower_with_under.py for module names.

https://google.github.io/styleguide/pyguide.html#3162-naming-conventions

I tried a few of these options out locally (bigframes.indexes.DefaultIndexKind and bigframes.pandas.DefaultIndexKind), but it feels strange to have something not really mimicking pandas in the pandas sub-package and bigframes.indexes.DefaultIndexKind would imply that we should move the Index and MultiIndex classes there, which is kinda the opposite of what we want to do.

The other option we could try is bigframes.pandas.core.indexes, but in pandas "core" is how they signify that an API is private and not to be relied on.

IMO, determining if classes are "related" by type for the basic types (e.g. exceptions, enums, ...) will be less effort for us long-term than having to figure out which public package to place these things in if they don't fit an existing API.

"exceptions",
"connect",
"Session",
"__version__",
10 changes: 10 additions & 0 deletions bigframes/core/blocks.py
@@ -116,10 +116,20 @@ def __init__(
raise ValueError(
f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length"
)

# If no index columns are set, create one.
#
# Note: get_index_cols_and_uniqueness in
# bigframes/session/_io/bigquery/read_gbq_table.py depends on this
# being a sequential integer index column. If this default behavior
# ever changes, please also update get_index_cols_and_uniqueness so
# that users who explicitly request a sequential integer index can
# still get one.
if len(index_columns) == 0:
new_index_col_id = guid.generate_guid()
expr = expr.promote_offsets(new_index_col_id)
index_columns = [new_index_col_id]

self._index_columns = tuple(index_columns)
# Index labels don't need complicated hierarchical access so can store as tuple
self._index_labels = (
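The default index that `Block.__init__` now creates is a zero-based row offset produced by `promote_offsets`. A TODO in this PR also mentions generating it via `ROW_NUMBER()` in the query. As a rough standalone sketch (the helper name and SQL shape are illustrative assumptions, not the actual implementation, which operates on the expression tree):

```python
def with_sequential_index_sql(table_expr_sql: str, index_col: str) -> str:
    # Hypothetical sketch: wrap a query in a projection that adds a
    # 0-based sequential integer column, analogous to what
    # promote_offsets does for the default index.
    return (
        f"SELECT *, ROW_NUMBER() OVER () - 1 AS `{index_col}` "
        f"FROM ({table_expr_sql})"
    )

print(with_sequential_index_sql("SELECT * FROM `proj.ds.tbl`", "row_offset"))
```

Note that `ROW_NUMBER() OVER ()` without an `ORDER BY` gives an arbitrary but total ordering, which matches the "consecutive integers" contract of the new enum without pinning a row order.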
29 changes: 29 additions & 0 deletions bigframes/enums.py
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Public enums used across BigQuery DataFrames."""

# NOTE: This module should not depend on any others in the package.


import enum


class DefaultIndexKind(enum.Enum):
"""Sentinel values used to override default indexing behavior."""

#: Use consecutive integers as the index. This is ``0``, ``1``, ``2``, ...,
#: ``n - 3``, ``n - 2``, ``n - 1``, where ``n`` is the number of items in
#: the index.
SEQUENTIAL_INT64 = enum.auto()
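The sentinel is dispatched in the session code below: when `index_col` is a `DefaultIndexKind`, no index columns are selected, so `Block.__init__` falls back to generating the sequential integer index. A self-contained mirror of that dispatch (the enum and helper are re-declared here for illustration only):

```python
import enum
from typing import Any, List


class DefaultIndexKind(enum.Enum):
    # Standalone mirror of bigframes.enums.DefaultIndexKind.
    SEQUENTIAL_INT64 = enum.auto()


def normalize_index_col(index_col: Any) -> List[str]:
    # Mirrors the dispatch this PR adds in _read_gbq_query and
    # _read_bigquery_load_job: a DefaultIndexKind sentinel yields no
    # index columns, so the default sequential index is created later.
    if isinstance(index_col, DefaultIndexKind):
        return []
    if isinstance(index_col, str):
        return [index_col]
    return list(index_col)


print(normalize_index_col("id"))        # ['id']
print(normalize_index_col(("a", "b")))  # ['a', 'b']
print(normalize_index_col(DefaultIndexKind.SEQUENTIAL_INT64))  # []
```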
8 changes: 8 additions & 0 deletions bigframes/exceptions.py
@@ -12,6 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Public exceptions and warnings used across BigQuery DataFrames."""

# NOTE: This module should not depend on any others in the package.


class UnknownLocationWarning(Warning):
"""The location is set to an unknown value."""


class NoDefaultIndexError(ValueError):
"""Unable to create a default index."""
15 changes: 11 additions & 4 deletions bigframes/pandas/__init__.py
@@ -63,6 +63,7 @@
import bigframes.core.reshape
import bigframes.core.tools
import bigframes.dataframe
import bigframes.enums
import bigframes.operations as ops
import bigframes.series
import bigframes.session
@@ -423,7 +424,13 @@ def read_csv(
Union[MutableSequence[Any], numpy.ndarray[Any, Any], Tuple[Any, ...], range]
] = None,
index_col: Optional[
Union[int, str, Sequence[Union[str, int]], Literal[False]]
Union[
int,
str,
Sequence[Union[str, int]],
bigframes.enums.DefaultIndexKind,
Literal[False],
]
] = None,
usecols: Optional[
Union[
@@ -491,7 +498,7 @@ def read_json(
def read_gbq(
query_or_table: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
configuration: Optional[Dict] = None,
max_results: Optional[int] = None,
@@ -529,7 +536,7 @@ def read_gbq_model(model_name: str):
def read_gbq_query(
query: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
configuration: Optional[Dict] = None,
max_results: Optional[int] = None,
@@ -555,7 +562,7 @@ def read_gbq_query(
def read_gbq_table(
query: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
filters: vendored_pandas_gbq.FiltersType = (),
74 changes: 55 additions & 19 deletions bigframes/session/__init__.py
@@ -294,7 +294,7 @@ def read_gbq(
self,
query_or_table: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
configuration: Optional[Dict] = None,
max_results: Optional[int] = None,
@@ -313,6 +313,9 @@ def read_gbq(

filters = list(filters)
if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table):
# TODO(b/338111344): This appears to be missing index_cols, which
# are necessary to be selected.
# TODO(b/338039517): Also, need to account for primary keys.
query_or_table = self._to_query(query_or_table, columns, filters)

if _is_query(query_or_table):
@@ -326,9 +329,6 @@
use_cache=use_cache,
)
else:
# TODO(swast): Query the snapshot table but mark it as a
# deterministic query so we can avoid serializing if we have a
# unique index.
if configuration is not None:
raise ValueError(
"The 'configuration' argument is not allowed when "
@@ -359,6 +359,8 @@ def _to_query(
else f"`{query_or_table}`"
)

# TODO(b/338111344): Generate an index based on DefaultIndexKind if we
# don't have index columns specified.
select_clause = "SELECT " + (
", ".join(f"`{column}`" for column in columns) if columns else "*"
)
@@ -488,7 +490,7 @@ def read_gbq_query(
self,
query: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
configuration: Optional[Dict] = None,
max_results: Optional[int] = None,
@@ -566,7 +568,7 @@ def _read_gbq_query(
self,
query: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
configuration: Optional[Dict] = None,
max_results: Optional[int] = None,
@@ -598,7 +600,9 @@ def _read_gbq_query(
True if use_cache is None else use_cache
)

if isinstance(index_col, str):
if isinstance(index_col, bigframes.enums.DefaultIndexKind):
index_cols = []
elif isinstance(index_col, str):
index_cols = [index_col]
else:
index_cols = list(index_col)
@@ -628,7 +632,7 @@

return self.read_gbq_table(
f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
index_col=index_cols,
index_col=index_col,
columns=columns,
max_results=max_results,
use_cache=configuration["query"]["useQueryCache"],
@@ -638,7 +642,7 @@ def read_gbq_table(
self,
query: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
filters: third_party_pandas_gbq.FiltersType = (),
@@ -693,7 +697,7 @@ def _read_gbq_table(
self,
query: str,
*,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
api_name: str,
@@ -821,10 +825,12 @@ def _read_bigquery_load_job(
table: Union[bigquery.Table, bigquery.TableReference],
*,
job_config: bigquery.LoadJobConfig,
index_col: Iterable[str] | str = (),
index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
columns: Iterable[str] = (),
) -> dataframe.DataFrame:
if isinstance(index_col, str):
if isinstance(index_col, bigframes.enums.DefaultIndexKind):
index_cols = []
elif isinstance(index_col, str):
index_cols = [index_col]
else:
index_cols = list(index_col)
@@ -1113,7 +1119,13 @@ def read_csv(
Union[MutableSequence[Any], np.ndarray[Any, Any], Tuple[Any, ...], range]
] = None,
index_col: Optional[
Union[int, str, Sequence[Union[str, int]], Literal[False]]
Union[
int,
str,
Sequence[Union[str, int]],
bigframes.enums.DefaultIndexKind,
Literal[False],
]
] = None,
usecols: Optional[
Union[
@@ -1143,18 +1155,37 @@
f"{constants.FEEDBACK_LINK}"
)

if index_col is not None and (
not index_col or not isinstance(index_col, str)
# TODO(b/338089659): Looks like we can relax this one-column
# restriction if we check that the contents of an iterable are
# strings, not integers.
if (
# Empty tuples, None, and False are allowed and falsey.
index_col
and not isinstance(index_col, bigframes.enums.DefaultIndexKind)
and not isinstance(index_col, str)
):
raise NotImplementedError(
"BigQuery engine only supports a single column name for `index_col`. "
f"{constants.FEEDBACK_LINK}"
"BigQuery engine only supports a single column name for `index_col`, "
f"got: {repr(index_col)}. {constants.FEEDBACK_LINK}"
)

# None value for index_col cannot be passed to read_gbq
if index_col is None:
# None and False cannot be passed to read_gbq.
# TODO(b/338400133): When index_col is None, we should be using the
# first column of the CSV as the index to be compatible with the
# pandas engine. According to the pandas docs, only "False"
# indicates a default sequential index.
if not index_col:
index_col = ()

index_col = typing.cast(
Union[
Sequence[str], # Falsey values
bigframes.enums.DefaultIndexKind,
str,
],
index_col,
)

# usecols should only be an iterable of strings (column names) for use as columns in read_gbq.
columns: Tuple[Any, ...] = tuple()
if usecols is not None:
@@ -1199,6 +1230,11 @@ def read_csv(
columns=columns,
)
else:
if isinstance(index_col, bigframes.enums.DefaultIndexKind):
raise NotImplementedError(
f"With index_col={repr(index_col)}, only engine='bigquery' is supported. "
f"{constants.FEEDBACK_LINK}"
)
if any(arg in kwargs for arg in ("chunksize", "iterator")):
raise NotImplementedError(
"'chunksize' and 'iterator' arguments are not supported. "
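The `read_csv` changes above gate the BigQuery engine on `index_col`: falsey values (`None`, `False`, `()`) and `DefaultIndexKind` sentinels pass through, and anything else must be a single column-name string. A self-contained mirror of that validation (enum and helper re-declared here purely for illustration):

```python
import enum
from typing import Any


class DefaultIndexKind(enum.Enum):
    SEQUENTIAL_INT64 = enum.auto()


def validate_bigquery_index_col(index_col: Any) -> None:
    # Mirrors the check this PR adds to read_csv for engine="bigquery":
    # empty tuples, None, and False are allowed and falsey; a
    # DefaultIndexKind sentinel or a single column-name string is also
    # allowed; anything else (e.g. a list of names) is rejected.
    if (
        index_col
        and not isinstance(index_col, DefaultIndexKind)
        and not isinstance(index_col, str)
    ):
        raise NotImplementedError(
            "BigQuery engine only supports a single column name for "
            f"`index_col`, got: {index_col!r}."
        )


validate_bigquery_index_col(False)                              # OK
validate_bigquery_index_col("id")                               # OK
validate_bigquery_index_col(DefaultIndexKind.SEQUENTIAL_INT64)  # OK
```

Note that `index_col=False` and `index_col=None` are later normalized to `()` before calling `read_gbq`, per the comments in the diff.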