Skip to content

fix: rename columns with protected names such as _TABLE_SUFFIX in to_gbq() #1691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a9edb2a
perf: defer query in `read_gbq` with wildcard tables
tswast Apr 27, 2025
df795b1
remove obsolete comments
tswast Apr 27, 2025
f81fe4e
Merge remote-tracking branch 'origin/main' into b405773140-wildcard
tswast Apr 28, 2025
79f4c58
use sql node instead of ibis table node to keep select * from omittin…
tswast Apr 28, 2025
5b0d0a0
test with cache and to_gbq
tswast Apr 29, 2025
118964b
rename columns before caching
tswast Apr 29, 2025
ca33463
remove unnecessary comment
tswast Apr 29, 2025
e546745
Merge remote-tracking branch 'origin/main' into b405773140-wildcard
tswast Apr 29, 2025
4897ca4
add missing import
tswast Apr 29, 2025
e1a7341
do not materialize _TABLE_SUFFIX
tswast Apr 29, 2025
af06200
fix unit tests
tswast Apr 29, 2025
af5c036
Merge branch 'main' into b405773140-wildcard
tswast Apr 29, 2025
f26574b
correct number of columns in cache with offsets
tswast Apr 29, 2025
dd05c2d
Merge branch 'main' into b405773140-wildcard
tswast Apr 29, 2025
ab0e50a
fix formatting
tswast Apr 29, 2025
89535e2
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Apr 29, 2025
8bb09d5
Merge branch 'b405773140-wildcard' of https://github.com/googleapis/p…
gcf-owl-bot[bot] Apr 29, 2025
40e2e77
Merge branch 'main' into b405773140-wildcard
tswast Apr 29, 2025
d37bf5e
revert datetime change, max_results change
tswast Apr 29, 2025
2f25f8d
Merge remote-tracking branch 'origin/b405773140-wildcard' into b40577…
tswast Apr 29, 2025
4bf66b6
add pseudocolumns to node
tswast Apr 29, 2025
8c96498
fix unit tests
tswast Apr 29, 2025
e1780a6
actually fix unit tests
tswast Apr 29, 2025
b027b51
try to rename as part of compile
tswast Apr 29, 2025
00fbd91
add renames to as cached table
tswast Apr 30, 2025
9a778db
use correct node for table schema
tswast Apr 30, 2025
d076cd3
Merge branch 'main' into b405773140-wildcard
tswast Apr 30, 2025
7d8ddcc
Merge remote-tracking branch 'origin/main' into b405773140-pseudocolumns
tswast May 5, 2025
0722229
revert pseudocolumn addition
tswast May 5, 2025
340645d
add tests for fix for invalid columns
tswast May 5, 2025
4f96c4c
Merge remote-tracking branch 'origin/main' into b405773140-fix-protec…
tswast May 5, 2025
f8745be
revert cached changes
tswast May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions bigframes/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
elif identifier[0].isdigit():
# first character must be letter or underscore
identifier = "_" + identifier

# Except in special circumstances (true anonymous query results tables),
# field names are not allowed to start with these (case-insensitive)
# prefixes.
# _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER
if any(
identifier.casefold().startswith(invalid_prefix.casefold())
for invalid_prefix in (
"_PARTITION",
"_TABLE_",
"_FILE_",
"_ROW_TIMESTAMP",
"__ROOT__",
"_COLIDENTIFIER",
)
):
# Remove leading _ character(s) to avoid collisions with reserved
# prefixes.
identifier = re.sub("^_+", "", identifier)

return identifier


Expand Down
68 changes: 68 additions & 0 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,74 @@ def test_to_gbq_w_duplicate_column_names(
)


def test_to_gbq_w_protected_column_names(
    scalars_df_index, scalars_pandas_df_index, dataset_id
):
    """
    Write columns named with protected prefixes via to_gbq() and read them back.

    Column names can't use any of the following prefixes:

    * _TABLE_
    * _FILE_
    * _PARTITION
    * _ROW_TIMESTAMP
    * __ROOT__
    * _COLIDENTIFIER

    See: https://cloud.google.com/bigquery/docs/schemas#column_names
    """
    destination_table = f"{dataset_id}.test_to_gbq_w_protected_column_names"

    # Fixture column -> name that starts with a protected prefix. The mapping
    # order is also the column order of the frame that gets written.
    protected_names = {
        "bool_col": "_Table_Suffix",
        "bytes_col": "_file_path",
        "date_col": "_PARTITIONDATE",
        "datetime_col": "_ROW_TIMESTAMP",
        "int64_col": "__ROOT__",
        "int64_too": "_COLIDENTIFIER",
        "numeric_col": "COLIDENTIFIER",  # Create a collision at serialization time.
    }

    # Fixture column -> name expected after the round trip. Leading _
    # characters are removed to make these columns valid in BigQuery.
    expected_names = {
        "bool_col": "Table_Suffix",
        "bytes_col": "file_path",
        "date_col": "PARTITIONDATE",
        "datetime_col": "ROW_TIMESTAMP",
        "int64_col": "ROOT__",
        "int64_too": "COLIDENTIFIER",
        "numeric_col": "COLIDENTIFIER_1",
    }

    bf_df = scalars_df_index.rename(columns=protected_names)
    bf_df = bf_df[list(protected_names.values())]
    bf_df.to_gbq(destination_table, if_exists="replace")

    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()

    expected = scalars_pandas_df_index.rename(columns=expected_names)
    expected = expected[list(expected_names.values())]

    pd.testing.assert_frame_equal(bf_result, expected)


def test_to_gbq_w_flexible_column_names(
scalars_df_index, dataset_id: str, bigquery_client
):
Expand Down