Skip to content

chore: add pyformat_args to _read_gbq_colab #1704

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions bigframes/core/pyformat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpers for the pyformat feature."""

# TODO(tswast): consolidate with pandas-gbq and bigquery-magics. See:
# https://github.com/googleapis/python-bigquery-magics/blob/main/bigquery_magics/pyformat.py

from __future__ import annotations

import string
import typing
from typing import Any, Union

import google.cloud.bigquery
import google.cloud.bigquery.table

# BigQuery client objects that this module treats as table references. The
# members are read back with typing.get_args() for isinstance checks, and
# values of these types are rendered by _table_to_sql as a fully qualified,
# backtick-quoted table identifier.
_BQ_TABLE_TYPES = Union[
    google.cloud.bigquery.Table,
    google.cloud.bigquery.TableReference,
    google.cloud.bigquery.table.TableListItem,
]


def _table_to_sql(table: _BQ_TABLE_TYPES) -> str:
return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"


def _field_to_template_value(name: str, value: Any) -> str:
    """Turn one template variable into the SQL text that replaces it.

    Args:
        name:
            Name of the template field; only used in error messages.
        value:
            The Python object supplied for that field.

    Returns:
        A table identifier when ``value`` is one of the BigQuery table types,
        otherwise whatever ``bigframes.core.sql.simple_literal`` produces for
        the value.

    Raises:
        TypeError: if ``value`` is not of a supported type.
    """
    import bigframes.core.sql  # Avoid circular imports

    # Reject unsupported types up front so the caller gets a TypeError that
    # names the offending field.
    _validate_type(name, value)

    if not isinstance(value, typing.get_args(_BQ_TABLE_TYPES)):
        # TODO(tswast): convert DataFrame objects to gbq tables or a literals subquery.
        return bigframes.core.sql.simple_literal(value)

    return _table_to_sql(value)


def _validate_type(name: str, value: Any):
    """Check that ``value`` can be embedded in a SQL template.

    Args:
        name:
            Name of the template field, included in the error message.
        value:
            The Python object supplied for that field.

    Raises:
        TypeError: if ``value`` is neither ``None``, a BigQuery table type,
            nor a member of ``bigframes.core.sql.SIMPLE_LITERAL_TYPES``.
    """
    import bigframes.core.sql  # Avoid circular imports

    # None can't be used in isinstance, but is a valid literal.
    if value is None:
        return

    table_types = typing.get_args(_BQ_TABLE_TYPES)
    literal_types = typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES)
    supported_types = table_types + literal_types
    if isinstance(value, supported_types):
        return

    raise TypeError(
        f"{name} has unsupported type: {type(value)}. "
        f"Only {supported_types} are supported."
    )


def _parse_fields(sql_template: str) -> list[str]:
return [
field_name
for _, field_name, _, _ in string.Formatter().parse(sql_template)
if field_name is not None
]


def pyformat(
    sql_template: str,
    *,
    pyformat_args: dict,
    # TODO: add dry_run parameter to avoid expensive API calls in conversion
    # TODO: and session to upload data / convert to table if necessary
) -> str:
    """Python-style string formatting of a SQL string.

    Every ``{var_name}`` field referenced by ``sql_template`` is looked up in
    ``pyformat_args``, converted to SQL text and substituted with
    ``str.format``. Entries of ``pyformat_args`` that the template never
    references are ignored, so they may be of any type. Write ``{{`` and
    ``}}`` for literal braces.

    Only some data types are supported. BigQuery table objects are rendered
    as backtick-quoted table identifiers; ``None`` and the types listed in
    ``bigframes.core.sql.SIMPLE_LITERAL_TYPES`` are rendered by
    ``bigframes.core.sql.simple_literal``.

    Note: string values take the ``simple_literal`` path like every other
    scalar, so they are embedded as SQL literals (that helper is documented
    as returning the quoted input) rather than as raw SQL text. To reference
    a table, pass a BigQuery table object instead of its name as a string.

    Warning: ``sql_template`` itself is used as-is; this function does not
    validate the resulting SQL.

    Field names are looked up verbatim, so only plain ``{name}`` fields are
    supported. Positional (``{}``, ``{0}``) and attribute / index
    (``{a.b}``, ``{a[0]}``) fields are not resolved specially.

    Args:
        sql_template (str):
            SQL string with 0+ {var_name}-style format options.
        pyformat_args (dict):
            Variable namespace to use for formatting.

    Returns:
        str: ``sql_template`` with every referenced field replaced.

    Raises:
        TypeError: if a referenced variable is not of a supported type.
        KeyError: if a referenced variable is not found.
    """
    format_kwargs: dict[str, str] = {}
    for name in _parse_fields(sql_template):
        if name in format_kwargs:
            # The template references this field more than once; the value
            # was already converted, so don't repeat the work.
            continue
        format_kwargs[name] = _field_to_template_value(name, pyformat_args[name])

    return sql_template.format(**format_kwargs)
18 changes: 15 additions & 3 deletions bigframes/core/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,22 @@
to_wkt = dumps


# Python types accepted by simple_literal (which additionally accepts None).
# Declared as a typing.Union so that other modules can read the members back
# with typing.get_args(), e.g. to build an isinstance() check.
SIMPLE_LITERAL_TYPES = Union[
    bytes,
    str,
    int,
    bool,
    float,
    datetime.datetime,
    datetime.date,
    datetime.time,
    decimal.Decimal,
    list,
]


### Writing SQL Values (literals, column references, table references, etc.)
def simple_literal(
value: bytes | str | int | bool | float | datetime.datetime | list | None,
):
def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str:
"""Return quoted input string."""

# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
Expand Down
27 changes: 25 additions & 2 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import bigframes._config.bigquery_options as bigquery_options
import bigframes.clients
from bigframes.core import blocks
import bigframes.core.pyformat

# Even though the ibis.backends.bigquery import is unused, it's needed
# to register new and replacement ops with the Ibis BigQuery backend.
Expand Down Expand Up @@ -480,16 +481,38 @@ def _read_gbq_colab(
self,
query: str,
# TODO: Add a callback parameter that takes some kind of Event object.
# TODO: Add parameter for variables for string formatting.
# TODO: Add dry_run parameter.
*,
pyformat_args: Optional[Dict[str, Any]] = None,
) -> dataframe.DataFrame:
"""A version of read_gbq that has the necessary default values for use in colab integrations.

This includes, no ordering, no index, no progress bar, always use string
formatting for embedding local variables / dataframes.

Args:
query (str):
A SQL query string to execute. Results (if any) are turned into
a DataFrame.
pyformat_args (dict):
A dictionary of potential variables to replace in ``query``.
Note: strings are _not_ escaped. Use query parameters for these,
instead. Note: unlike read_gbq / read_gbq_query, even if set to
None, this function always assumes {var} refers to a variable
that is supposed to be supplied in this dictionary.
"""
# TODO: Allow for a table ID to avoid queries like with read_gbq?

if pyformat_args is None:
pyformat_args = {}

# TODO: move this to read_gbq_query if/when we expose this feature
# beyond in _read_gbq_colab.
query = bigframes.core.pyformat.pyformat(
query,
pyformat_args=pyformat_args,
)

# TODO: Allow for a table ID to avoid queries like read_gbq?
return self._loader.read_gbq_query(
query=query,
index_col=bigframes.enums.DefaultIndexKind.NULL,
Expand Down
66 changes: 66 additions & 0 deletions tests/system/small/session/test_read_gbq_colab.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

"""System tests for read_gbq_colab helper functions."""

import pandas
import pandas.testing


def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session):
df = maybe_ordered_session._read_gbq_colab(
Expand All @@ -39,3 +42,66 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi
total_rows += len(batch.index)

assert total_rows > 0


def test_read_gbq_colab_includes_formatted_scalars(session):
    """Referenced scalars are substituted into the query; ``{{}}`` is kept."""
    some_string = "This could be dangerous, but we esape it"
    query = (
        "\n"
        "SELECT {some_integer} as some_integer,\n"
        "{some_string} as some_string,\n"
        "'{{escaped}}' as escaped\n"
    )

    actual = session._read_gbq_colab(
        query,
        pyformat_args={
            "some_integer": 123,
            "some_string": some_string,
            # This is not a supported type, but ignored if not referenced.
            "some_object": object(),
        },
    ).to_pandas()

    expected = pandas.DataFrame(
        {
            "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
            "some_string": pandas.Series([some_string], dtype="string[pyarrow]"),
            "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
        }
    )
    pandas.testing.assert_frame_equal(actual, expected)


def test_read_gbq_colab_includes_formatted_bigframes_dataframe(session):
    """Placeholder for DataFrame formatting; currently exercises scalars only."""
    integer_value = 123
    string_value = "This could be dangerous, but we esape it"
    variables = dict(
        # TODO: put a bigframes DataFrame here.
        some_integer=integer_value,
        some_string=string_value,
        # This is not a supported type, but ignored if not referenced.
        some_object=object(),
    )
    sql = "\n".join(
        [
            "",
            "SELECT {some_integer} as some_integer,",
            "{some_string} as some_string,",
            "'{{escaped}}' as escaped",
            "",
        ]
    )

    bf_df = session._read_gbq_colab(sql, pyformat_args=variables)
    pd_df = bf_df.to_pandas()

    expected_columns = {
        "some_integer": pandas.Series([integer_value], dtype=pandas.Int64Dtype()),
        "some_string": pandas.Series([string_value], dtype="string[pyarrow]"),
        "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
    }
    pandas.testing.assert_frame_equal(pd_df, pandas.DataFrame(expected_columns))
Loading