fix: rename columns with protected names such as _TABLE_SUFFIX in to_gbq() (#1691)

tswast · gcf-owl-bot[bot] · web-flow · commit 8ec607986fd3 · 2025-05-06T12:29:00.000-05:00
* perf: defer query in `read_gbq` with wildcard tables
* remove obsolete comments
* use sql node instead of ibis table node to keep `select *` from omitting pseudocolumns

  Fixes this code sample:

      import bigframes.pandas as bpd
      df = bpd.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
      df[df["_TABLE_SUFFIX"] == "20161204"].peek()

* test with cache and to_gbq
* rename columns before caching
* remove unnecessary comment
* add missing import
* do not materialize _TABLE_SUFFIX
* fix unit tests
* correct number of columns in cache with offsets
* fix formatting
* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* revert datetime change, max_results change
* add pseudocolumns to node
* fix unit tests
* actually fix unit tests
* try to rename as part of compile
* use correct node for table schema
* revert pseudocolumn addition
* add tests for fix for invalid columns
* revert cached changes

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
@@ -147,6 +147,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
         elif identifier[0].isdigit():
             # first character must be letter or underscore
             identifier = "_" + identifier
+
+    # Except in special circumstances (true anonymous query results tables),
+    # field names are not allowed to start with these (case-insensitive)
+    # prefixes.
+    # _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER
+    if any(
+        identifier.casefold().startswith(invalid_prefix.casefold())
+        for invalid_prefix in (
+            "_PARTITION",
+            "_TABLE_",
+            "_FILE_",
+            "_ROW_TIMESTAMP",
+            "__ROOT__",
+            "_COLIDENTIFIER",
+        )
+    ):
+        # Remove leading _ character(s) to avoid collisions with reserved
+        # prefixes.
+        identifier = re.sub("^_+", "", identifier)
+
     return identifier
 
 
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
@@ -552,6 +552,74 @@ def test_to_gbq_w_duplicate_column_names(
     )
 
 
+def test_to_gbq_w_protected_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """
+    Column names can't use any of the following prefixes:
+
+    * _TABLE_
+    * _FILE_
+    * _PARTITION
+    * _ROW_TIMESTAMP
+    * __ROOT__
+    * _COLIDENTIFIER
+
+    See: https://cloud.google.com/bigquery/docs/schemas#column_names
+    """
+    destination_table = f"{dataset_id}.test_to_gbq_w_protected_column_names"
+
+    scalars_df_index = scalars_df_index.rename(
+        columns={
+            "bool_col": "_Table_Suffix",
+            "bytes_col": "_file_path",
+            "date_col": "_PARTITIONDATE",
+            "datetime_col": "_ROW_TIMESTAMP",
+            "int64_col": "__ROOT__",
+            "int64_too": "_COLIDENTIFIER",
+            "numeric_col": "COLIDENTIFIER",  # Create a collision at serialization time.
+        }
+    )[
+        [
+            "_Table_Suffix",
+            "_file_path",
+            "_PARTITIONDATE",
+            "_ROW_TIMESTAMP",
+            "__ROOT__",
+            "_COLIDENTIFIER",
+            "COLIDENTIFIER",
+        ]
+    ]
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    # Leading _ characters are removed to make these columns valid in BigQuery.
+    expected = scalars_pandas_df_index.rename(
+        columns={
+            "bool_col": "Table_Suffix",
+            "bytes_col": "file_path",
+            "date_col": "PARTITIONDATE",
+            "datetime_col": "ROW_TIMESTAMP",
+            "int64_col": "ROOT__",
+            "int64_too": "COLIDENTIFIER",
+            "numeric_col": "COLIDENTIFIER_1",
+        }
+    )[
+        [
+            "Table_Suffix",
+            "file_path",
+            "PARTITIONDATE",
+            "ROW_TIMESTAMP",
+            "ROOT__",
+            "COLIDENTIFIER",
+            "COLIDENTIFIER_1",
+        ]
+    ]
+
+    pd.testing.assert_frame_equal(bf_result, expected)
+
+
 def test_to_gbq_w_flexible_column_names(
     scalars_df_index, dataset_id: str, bigquery_client
 ):