diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 495523d2fc..ba3fdcfd4b 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -147,6 +147,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str: elif identifier[0].isdigit(): # first character must be letter or underscore identifier = "_" + identifier + + # Except in special circumstances (true anonymous query results tables), + # field names are not allowed to start with these (case-insensitive) + # prefixes. + # _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER + if any( + identifier.casefold().startswith(invalid_prefix.casefold()) + for invalid_prefix in ( + "_PARTITION", + "_TABLE_", + "_FILE_", + "_ROW_TIMESTAMP", + "__ROOT__", + "_COLIDENTIFIER", + ) + ): + # Remove leading _ character(s) to avoid collisions with preserved + # prefixes. + identifier = re.sub("^_+", "", identifier) + return identifier diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index e12db3f598..e210fed522 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -552,6 +552,74 @@ def test_to_gbq_w_duplicate_column_names( ) +def test_to_gbq_w_protected_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """ + Column names can't use any of the following prefixes: + + * _TABLE_ + * _FILE_ + * _PARTITION + * _ROW_TIMESTAMP + * __ROOT__ + * _COLIDENTIFIER + + See: https://cloud.google.com/bigquery/docs/schemas#column_names + """ + destination_table = f"{dataset_id}.test_to_gbq_w_protected_column_names" + + scalars_df_index = scalars_df_index.rename( + columns={ + "bool_col": "_Table_Suffix", + "bytes_col": "_file_path", + "date_col": "_PARTITIONDATE", + "datetime_col": "_ROW_TIMESTAMP", + "int64_col": "__ROOT__", + "int64_too": "_COLIDENTIFIER", + "numeric_col": "COLIDENTIFIER", # Create a collision at serialization time. + } + )[ + [ + "_Table_Suffix", + "_file_path", + "_PARTITIONDATE", + "_ROW_TIMESTAMP", + "__ROOT__", + "_COLIDENTIFIER", + "COLIDENTIFIER", + ] + ] + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + # Leading _ characters are removed to make these columns valid in BigQuery. + expected = scalars_pandas_df_index.rename( + columns={ + "bool_col": "Table_Suffix", + "bytes_col": "file_path", + "date_col": "PARTITIONDATE", + "datetime_col": "ROW_TIMESTAMP", + "int64_col": "ROOT__", + "int64_too": "COLIDENTIFIER", + "numeric_col": "COLIDENTIFIER_1", + } + )[ + [ + "Table_Suffix", + "file_path", + "PARTITIONDATE", + "ROW_TIMESTAMP", + "ROOT__", + "COLIDENTIFIER", + "COLIDENTIFIER_1", + ] + ] + + pd.testing.assert_frame_equal(bf_result, expected) + + def test_to_gbq_w_flexible_column_names( scalars_df_index, dataset_id: str, bigquery_client ):