Skip to content

Commit c260fc8

Browse files
authored
chore: add pyformat_args to _read_gbq_colab (#1704)
* chore: add private _read_gbq_colab method that uses partial ordering mode, disables progress bars, disables default index, and communicates via callbacks * add colab read gbq * add test for ordering * add ordered argument to to_pandas_batches * add unit test looking for job labels * remove ordered option for to_pandas_batches * ignore type for mock job configs * chore: add pyformat_args to _read_gbq_colab * fix unit tests * add test for _read_gbq_colab * escape strings * fix null support
1 parent f5d91f3 commit c260fc8

File tree

5 files changed

+362
-5
lines changed

5 files changed

+362
-5
lines changed

bigframes/core/pyformat.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Helpers for the pyformat feature."""
16+
17+
# TODO(tswast): consolidate with pandas-gbq and bigquery-magics. See:
18+
# https://github.com/googleapis/python-bigquery-magics/blob/main/bigquery_magics/pyformat.py
19+
20+
from __future__ import annotations
21+
22+
import string
23+
import typing
24+
from typing import Any, Union
25+
26+
import google.cloud.bigquery
27+
import google.cloud.bigquery.table
28+
29+
# BigQuery client objects that are accepted as table references in a SQL
# template. _table_to_sql reads ``project``, ``dataset_id`` and ``table_id``
# from values of these types.
_BQ_TABLE_TYPES = Union[
    google.cloud.bigquery.Table,
    google.cloud.bigquery.TableReference,
    google.cloud.bigquery.table.TableListItem,
]
34+
35+
36+
def _table_to_sql(table: _BQ_TABLE_TYPES) -> str:
    """Return the table's fully-qualified name with every part backtick-quoted.

    Args:
        table: Object exposing ``project``, ``dataset_id`` and ``table_id``.

    Returns:
        str: Identifier of the form `` `project`.`dataset`.`table` ``.
    """
    parts = (table.project, table.dataset_id, table.table_id)
    return ".".join(f"`{part}`" for part in parts)
38+
39+
40+
def _field_to_template_value(name: str, value: Any) -> str:
    """Render ``value`` as text that can be substituted into a SQL template.

    Args:
        name (str): Template field name; forwarded to validation so that it
            can appear in the error message.
        value (Any): Value supplied for that field.

    Returns:
        str: A backtick-quoted table identifier when ``value`` is one of the
        BigQuery table types, otherwise the result of
        ``bigframes.core.sql.simple_literal``.

    Raises:
        TypeError: if ``value`` is of an unsupported type.
    """
    import bigframes.core.sql  # Avoid circular imports

    _validate_type(name, value)

    # TODO(tswast): convert DataFrame objects to gbq tables or a literals subquery.
    if not isinstance(value, typing.get_args(_BQ_TABLE_TYPES)):
        return bigframes.core.sql.simple_literal(value)

    return _table_to_sql(value)
52+
53+
54+
def _validate_type(name: str, value: Any):
    """Check that ``value`` can be embedded in a SQL template.

    Args:
        name (str): Template field name; used only in the error message.
        value (Any): Candidate value for that field.

    Raises:
        TypeError: if ``value`` is not None, not a BigQuery table object and
            not one of ``bigframes.core.sql.SIMPLE_LITERAL_TYPES``.
    """
    import bigframes.core.sql  # Avoid circular imports

    # None is a valid literal but is not listed in either Union, so accept it
    # before the isinstance check.
    if value is None:
        return

    table_types = typing.get_args(_BQ_TABLE_TYPES)
    literal_types = typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES)
    supported_types = table_types + literal_types
    if isinstance(value, supported_types):
        return

    raise TypeError(
        f"{name} has unsupported type: {type(value)}. "
        f"Only {supported_types} are supported."
    )
69+
70+
71+
def _parse_fields(sql_template: str) -> list[str]:
    """List the replacement field names in ``sql_template``, in order.

    Literal-only segments (including escaped ``{{`` / ``}}`` braces) do not
    contribute a name. A field that occurs several times is listed once per
    occurrence.
    """
    names = []
    for parsed in string.Formatter().parse(sql_template):
        # Each parsed item is (literal_text, field_name, format_spec,
        # conversion); field_name is None for a segment without a field.
        field_name = parsed[1]
        if field_name is not None:
            names.append(field_name)
    return names
77+
78+
79+
def pyformat(
    sql_template: str,
    *,
    pyformat_args: dict,
    # TODO: add dry_run parameter to avoid expensive API calls in conversion
    # TODO: and session to upload data / convert to table if necessary
) -> str:
    """Python-style string formatting of a SQL string.

    Every ``{var_name}`` field referenced by ``sql_template`` is looked up in
    ``pyformat_args``, validated, and converted to SQL text before it is
    substituted: BigQuery table objects become backtick-quoted
    ``project.dataset.table`` identifiers, and every other supported value is
    rendered by ``bigframes.core.sql.simple_literal``.

    Only some data types are supported. Entries of ``pyformat_args`` that the
    template does not reference are ignored, whatever their type.

    Warning: strings are embedded as quoted SQL literals, not as raw text, so
    a string cannot be used to supply an identifier such as a table name.
    Pass a BigQuery table object for that instead.

    Args:
        sql_template (str):
            SQL string with 0+ {var_name}-style format options. Use ``{{``
            and ``}}`` for literal braces.
        pyformat_args (dict):
            Variable namespace to use for formatting.

    Returns:
        str: ``sql_template`` with every referenced field replaced.

    Raises:
        TypeError: if a referenced variable is not of a supported type.
        KeyError: if a referenced variable is not found.
    """
    format_kwargs = {}
    for name in _parse_fields(sql_template):
        # A field may occur several times in the template; look it up and
        # convert it only once.
        if name not in format_kwargs:
            format_kwargs[name] = _field_to_template_value(
                name, pyformat_args[name]
            )

    return sql_template.format(**format_kwargs)

bigframes/core/sql.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,22 @@
4242
to_wkt = dumps
4343

4444

45+
# Python types that simple_literal accepts (in addition to None). The member
# order is kept stable because callers may display it via typing.get_args.
SIMPLE_LITERAL_TYPES = Union[
    # Binary and text.
    bytes,
    str,
    # Numeric and boolean scalars.
    int,
    bool,
    float,
    # Date / time values.
    datetime.datetime,
    datetime.date,
    datetime.time,
    # Exact decimals and arrays.
    decimal.Decimal,
    list,
]
57+
58+
4559
### Writing SQL Values (literals, column references, table references, etc.)
46-
def simple_literal(
47-
value: bytes | str | int | bool | float | datetime.datetime | list | None,
48-
):
60+
def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str:
4961
"""Return quoted input string."""
5062

5163
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals

bigframes/session/__init__.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import bigframes._config.bigquery_options as bigquery_options
6262
import bigframes.clients
6363
from bigframes.core import blocks
64+
import bigframes.core.pyformat
6465

6566
# Even though the ibis.backends.bigquery import is unused, it's needed
6667
# to register new and replacement ops with the Ibis BigQuery backend.
@@ -480,16 +481,38 @@ def _read_gbq_colab(
480481
self,
481482
query: str,
482483
# TODO: Add a callback parameter that takes some kind of Event object.
483-
# TODO: Add parameter for variables for string formatting.
484484
# TODO: Add dry_run parameter.
485+
*,
486+
pyformat_args: Optional[Dict[str, Any]] = None,
485487
) -> dataframe.DataFrame:
486488
"""A version of read_gbq that has the necessary default values for use in colab integrations.
487489
488490
This includes, no ordering, no index, no progress bar, always use string
489491
formatting for embedding local variables / dataframes.
492+
493+
Args:
494+
query (str):
495+
A SQL query string to execute. Results (if any) are turned into
496+
a DataFrame.
497+
pyformat_args (dict):
498+
A dictionary of potential variables to replace in ``query``.
499+
Note: string values are escaped and embedded as quoted SQL string
500+
literals. Note: unlike read_gbq / read_gbq_query, even if set to
501+
None, this function always assumes {var} refers to a variable
502+
that is supposed to be supplied in this dictionary.
490503
"""
504+
# TODO: Allow for a table ID to avoid queries like with read_gbq?
505+
506+
if pyformat_args is None:
507+
pyformat_args = {}
508+
509+
# TODO: move this to read_gbq_query if/when we expose this feature
510+
# beyond in _read_gbq_colab.
511+
query = bigframes.core.pyformat.pyformat(
512+
query,
513+
pyformat_args=pyformat_args,
514+
)
491515

492-
# TODO: Allow for a table ID to avoid queries like read_gbq?
493516
return self._loader.read_gbq_query(
494517
query=query,
495518
index_col=bigframes.enums.DefaultIndexKind.NULL,

tests/system/small/session/test_read_gbq_colab.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
"""System tests for read_gbq_colab helper functions."""
1616

17+
import pandas
18+
import pandas.testing
19+
1720

1821
def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session):
1922
df = maybe_ordered_session._read_gbq_colab(
@@ -39,3 +42,66 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi
3942
total_rows += len(batch.index)
4043

4144
assert total_rows > 0
45+
46+
47+
def test_read_gbq_colab_includes_formatted_scalars(session):
    """Scalar pyformat_args are embedded in the query and come back as data."""
    some_string = "This could be dangerous, but we esape it"
    expected = pandas.DataFrame(
        {
            "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
            "some_string": pandas.Series([some_string], dtype="string[pyarrow]"),
            # Doubled braces in the template are an escape for literal braces.
            "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
        }
    )

    df = session._read_gbq_colab(
        """
        SELECT {some_integer} as some_integer,
        {some_string} as some_string,
        '{{escaped}}' as escaped
        """,
        pyformat_args={
            "some_integer": 123,
            "some_string": some_string,
            # This is not a supported type, but ignored if not referenced.
            "some_object": object(),
        },
    )

    pandas.testing.assert_frame_equal(df.to_pandas(), expected)
76+
77+
78+
def test_read_gbq_colab_includes_formatted_bigframes_dataframe(session):
    """Placeholder for DataFrame formatting; currently exercises scalars only."""
    some_string = "This could be dangerous, but we esape it"
    expected = pandas.DataFrame(
        {
            "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
            "some_string": pandas.Series([some_string], dtype="string[pyarrow]"),
            # Doubled braces in the template are an escape for literal braces.
            "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
        }
    )

    df = session._read_gbq_colab(
        """
        SELECT {some_integer} as some_integer,
        {some_string} as some_string,
        '{{escaped}}' as escaped
        """,
        pyformat_args={
            # TODO: put a bigframes DataFrame here.
            "some_integer": 123,
            "some_string": some_string,
            # This is not a supported type, but ignored if not referenced.
            "some_object": object(),
        },
    )

    pandas.testing.assert_frame_equal(df.to_pandas(), expected)

0 commit comments

Comments
 (0)