googleapis · tswast · May 6, 2025 · Apr 27, 2025 · Apr 27, 2025 · Apr 28, 2025
@@ -147,6 +147,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
         elif identifier[0].isdigit():
             # first character must be letter or underscore
             identifier = "_" + identifier
+
+    # Except in special circumstances (true anonymous query results tables),
+    # field names are not allowed to start with these (case-insensitive)
+    # prefixes.
+    # _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER
+    if any(
+        identifier.casefold().startswith(invalid_prefix.casefold())
+        for invalid_prefix in (
+            "_PARTITION",
+            "_TABLE_",
+            "_FILE_",
+            "_ROW_TIMESTAMP",
+            "__ROOT__",
+            "_COLIDENTIFIER",
+        )
+    ):
+        # Remove leading _ character(s) to avoid collisions with preserved
+        # prefixes.
+        identifier = re.sub("^_+", "", identifier)
+
     return identifier
 
 

@@ -552,6 +552,74 @@ def test_to_gbq_w_duplicate_column_names(
     )
 
 
+def test_to_gbq_w_protected_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """
+    Column names can't use any of the following prefixes:
+
+    * _TABLE_
+    * _FILE_
+    * _PARTITION
+    * _ROW_TIMESTAMP
+    * __ROOT__
+    * _COLIDENTIFIER
+
+    See: https://cloud.google.com/bigquery/docs/schemas#column_names
+    """
+    destination_table = f"{dataset_id}.test_to_gbq_w_protected_column_names"
+
+    scalars_df_index = scalars_df_index.rename(
+        columns={
+            "bool_col": "_Table_Suffix",
+            "bytes_col": "_file_path",
+            "date_col": "_PARTITIONDATE",
+            "datetime_col": "_ROW_TIMESTAMP",
+            "int64_col": "__ROOT__",
+            "int64_too": "_COLIDENTIFIER",
+            "numeric_col": "COLIDENTIFIER",  # Create a collision at serialization time.
+        }
+    )[
+        [
+            "_Table_Suffix",
+            "_file_path",
+            "_PARTITIONDATE",
+            "_ROW_TIMESTAMP",
+            "__ROOT__",
+            "_COLIDENTIFIER",
+            "COLIDENTIFIER",
+        ]
+    ]
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    # Leading _ characters are removed to make these columns valid in BigQuery.
+    expected = scalars_pandas_df_index.rename(
+        columns={
+            "bool_col": "Table_Suffix",
+            "bytes_col": "file_path",
+            "date_col": "PARTITIONDATE",
+            "datetime_col": "ROW_TIMESTAMP",
+            "int64_col": "ROOT__",
+            "int64_too": "COLIDENTIFIER",
+            "numeric_col": "COLIDENTIFIER_1",
+        }
+    )[
+        [
+            "Table_Suffix",
+            "file_path",
+            "PARTITIONDATE",
+            "ROW_TIMESTAMP",
+            "ROOT__",
+            "COLIDENTIFIER",
+            "COLIDENTIFIER_1",
+        ]
+    ]
+
+    pd.testing.assert_frame_equal(bf_result, expected)
+
+
 def test_to_gbq_w_flexible_column_names(
     scalars_df_index, dataset_id: str, bigquery_client
 ):