Skip to content

Commit 8ec6079

Browse files
fix: rename columns with protected names such as _TABLE_SUFFIX in to_gbq() (#1691)
* perf: defer query in `read_gbq` with wildcard tables
* remove obsolete comments
* use sql node instead of ibis table node to keep `select *` from omitting pseudocolumns

  Fixes this code sample:

      import bigframes.pandas as bpd
      df = bpd.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
      df[df["_TABLE_SUFFIX"] == "20161204"].peek()

* test with cache and to_gbq
* rename columns before caching
* remove unnecessary comment
* add missing import
* do not materialize _TABLE_SUFFIX
* fix unit tests
* correct number of columns in cache with offsets
* fix formatting
* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* revert datetime change, max_results change
* add pseudocolumns to node
* fix unit tests
* actually fix unit tests
* try to rename as part of compile
* use correct node for table schema
* revert pseudocolumn addition
* add tests for fix for invalid columns
* revert cached changes

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 1ed9d46 commit 8ec6079

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed

bigframes/core/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
147147
elif identifier[0].isdigit():
148148
# first character must be letter or underscore
149149
identifier = "_" + identifier
150+
151+
# Except in special circumstances (true anonymous query results tables),
152+
# field names are not allowed to start with these (case-insensitive)
153+
# prefixes.
154+
# _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER
155+
if any(
156+
identifier.casefold().startswith(invalid_prefix.casefold())
157+
for invalid_prefix in (
158+
"_PARTITION",
159+
"_TABLE_",
160+
"_FILE_",
161+
"_ROW_TIMESTAMP",
162+
"__ROOT__",
163+
"_COLIDENTIFIER",
164+
)
165+
):
166+
# Remove leading _ character(s) to avoid collisions with reserved
167+
# prefixes.
168+
identifier = re.sub("^_+", "", identifier)
169+
150170
return identifier
151171

152172

tests/system/small/test_dataframe_io.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,74 @@ def test_to_gbq_w_duplicate_column_names(
552552
)
553553

554554

555+
def test_to_gbq_w_protected_column_names(
    scalars_df_index, scalars_pandas_df_index, dataset_id
):
    """
    Write columns with protected names via to_gbq() and read them back.

    Column names can't use any of the following prefixes:

    * _TABLE_
    * _FILE_
    * _PARTITION
    * _ROW_TIMESTAMP
    * __ROOT__
    * _COLIDENTIFIER

    See: https://cloud.google.com/bigquery/docs/schemas#column_names
    """
    table_id = f"{dataset_id}.test_to_gbq_w_protected_column_names"

    # Source column -> name starting with a protected prefix (mixed case on
    # purpose). Dict order doubles as the column selection order below.
    protected_names = {
        "bool_col": "_Table_Suffix",
        "bytes_col": "_file_path",
        "date_col": "_PARTITIONDATE",
        "datetime_col": "_ROW_TIMESTAMP",
        "int64_col": "__ROOT__",
        "int64_too": "_COLIDENTIFIER",
        "numeric_col": "COLIDENTIFIER",  # Create a collision at serialization time.
    }
    renamed_df = scalars_df_index.rename(columns=protected_names)
    renamed_df = renamed_df[list(protected_names.values())]
    renamed_df.to_gbq(table_id, if_exists="replace")

    round_tripped = bpd.read_gbq(table_id, index_col="rowindex").to_pandas()

    # Leading _ characters are removed to make these columns valid in BigQuery.
    # The colliding name is expected to come back with a "_1" suffix.
    sanitized_names = {
        "bool_col": "Table_Suffix",
        "bytes_col": "file_path",
        "date_col": "PARTITIONDATE",
        "datetime_col": "ROW_TIMESTAMP",
        "int64_col": "ROOT__",
        "int64_too": "COLIDENTIFIER",
        "numeric_col": "COLIDENTIFIER_1",
    }
    expected = scalars_pandas_df_index.rename(columns=sanitized_names)
    expected = expected[list(sanitized_names.values())]

    pd.testing.assert_frame_equal(round_tripped, expected)
621+
622+
555623
def test_to_gbq_w_flexible_column_names(
556624
scalars_df_index, dataset_id: str, bigquery_client
557625
):

0 commit comments

Comments
 (0)