From 290253f11062c73306500d1bd171fd1719d8ba01 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 5 May 2025 15:18:40 -0500 Subject: [PATCH 01/12] chore: add private _read_gbq_colab method that uses partial ordering mode, disables progress bars, disables default index, and communicates via callbacks --- bigframes/session/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 81866e0d32..656d597c1c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -440,6 +440,21 @@ def _register_object( ): self._objects.append(weakref.ref(object)) + def _read_gbq_colab( + self, + query_or_table: str, + *, + # TODO: type for paramter: some kind of Event + callback: Callable = lambda _: None, + # TODO: add parameter for variables for string formatting. + ) -> dataframe.DataFrame: + """A version of read_gbq that has the necessary default values for use in colab integrations. + + This includes, no ordering, no index, no progress bar, always use string + formatting for embedding local variables / dataframes. + """ + return self.read_gbq(query_or_table) + def read_gbq_query( self, query: str, From adc5fa867d5280f8f35bb4f53a8a08b8c4f816b2 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 6 May 2025 12:22:25 -0500 Subject: [PATCH 02/12] add colab read gbq --- bigframes/session/__init__.py | 14 ++++++++----- bigframes/session/loader.py | 12 ++++++++++- tests/system/small/session/__init__.py | 13 ++++++++++++ .../small/session/test_read_gbq_colab.py | 20 +++++++++++++++++++ tests/unit/session/test_read_gbq_colab.py | 17 ++++++++++++++++ 5 files changed, 70 insertions(+), 6 deletions(-) create mode 100644 tests/system/small/session/__init__.py create mode 100644 tests/system/small/session/test_read_gbq_colab.py create mode 100644 tests/unit/session/test_read_gbq_colab.py diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 656d597c1c..e878c47d0e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -442,10 +442,8 @@ def _register_object( def _read_gbq_colab( self, - query_or_table: str, - *, - # TODO: type for paramter: some kind of Event - callback: Callable = lambda _: None, + query: str, + # TODO: Add a callback parameter that takes some kind of Event object. # TODO: add parameter for variables for string formatting. ) -> dataframe.DataFrame: """A version of read_gbq that has the necessary default values for use in colab integrations. @@ -453,7 +451,13 @@ def _read_gbq_colab( This includes, no ordering, no index, no progress bar, always use string formatting for embedding local variables / dataframes. """ - return self.read_gbq(query_or_table) + + # TODO: Allow for a table ID to avoid queries like read_gbq? + return self._loader.read_gbq_query( + query=query, + index_col=bigframes.enums.DefaultIndexKind.NULL, + api_name="read_gbq_colab", + ) def read_gbq_query( self, diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 2d554737ee..c3fb0a481b 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -369,6 +369,7 @@ def read_gbq_table( use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), enable_snapshot: bool = True, + force_total_order: Optional[bool] = None, ) -> dataframe.DataFrame: import bigframes._tools.strings import bigframes.dataframe as dataframe @@ -562,7 +563,14 @@ def read_gbq_table( session=self._session, ) # if we don't have a unique index, we order by row hash if we are in strict mode - if self._force_total_order: + if ( + # If the user has explicitly selected or disabled total ordering for + # this API call, respect that choice. + (force_total_order is not None and force_total_order) + # If the user has not explicitly selected or disabled total ordering + # for this API call, respect the default choice for the session. + or (force_total_order is None and self._force_total_order) + ): if not primary_key: array_value = array_value.order_by( [ @@ -664,6 +672,7 @@ def read_gbq_query( api_name: str = "read_gbq_query", use_cache: Optional[bool] = None, filters: third_party_pandas_gbq.FiltersType = (), + force_total_order: Optional[bool] = None, ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe @@ -743,6 +752,7 @@ def read_gbq_query( columns=columns, use_cache=configuration["query"]["useQueryCache"], api_name=api_name, + force_total_order=force_total_order, # max_results and filters are omitted because they are already # handled by to_query(), above. ) diff --git a/tests/system/small/session/__init__.py b/tests/system/small/session/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/session/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py new file mode 100644 index 0000000000..6cce2a62b7 --- /dev/null +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for read_gbq_colab helper functions.""" + + +def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): + _ = maybe_ordered_session._read_gbq_colab("""""") + # TODO: check that order by is preserved in to_pandas_batches() diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py new file mode 100644 index 0000000000..cef9665048 --- /dev/null +++ b/tests/unit/session/test_read_gbq_colab.py @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for read_gbq_colab helper functions.""" + +# TODO: Make sure job label is set. From 4b84737285876bcf417005264e9e8086f907d556 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 6 May 2025 12:56:18 -0500 Subject: [PATCH 03/12] add test for ordering --- .../small/session/test_read_gbq_colab.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 6cce2a62b7..27cad16014 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -16,5 +16,25 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): - _ = maybe_ordered_session._read_gbq_colab("""""") - # TODO: check that order by is preserved in to_pandas_batches() + # TODO: why is the order destroyed in strict order mode? + df = maybe_ordered_session._read_gbq_colab( + """ + SELECT + name, + SUM(number) AS total + FROM + `bigquery-public-data.usa_names.usa_1910_2013` + WHERE state LIKE 'W%' + GROUP BY name + ORDER BY total DESC + LIMIT 300 + """ + ) + batches = df.to_pandas_batches(page_size=100) + + total_rows = 0 + for batch in batches: + assert batch["total"].is_monotonic_decreasing + total_rows += len(batch.index) + + assert total_rows > 0 From afd61ca4f74b700f268a9d5497df399bb9d14cfa Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 6 May 2025 14:16:01 -0500 Subject: [PATCH 04/12] add ordered argument to to_pandas_batches --- bigframes/core/blocks.py | 4 +++- bigframes/dataframe.py | 5 +++++ tests/system/small/session/test_read_gbq_colab.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index a587b8af31..e2bcc633ea 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -587,6 +587,8 @@ def to_pandas_batches( max_results: Optional[int] = None, allow_large_results: Optional[bool] = None, squeeze: Optional[bool] = False, + *, + ordered: bool = True, ): """Download results one message at a time. @@ -594,7 +596,7 @@ def to_pandas_batches( see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result""" execute_result = self.session._executor.execute( self.expr, - ordered=True, + ordered=ordered, use_explicit_destination=allow_large_results, page_size=page_size, max_results=max_results, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9cb388329e..54bcb07385 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1779,6 +1779,7 @@ def to_pandas_batches( max_results: Optional[int] = None, *, allow_large_results: Optional[bool] = None, + ordered: bool = True, ) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame. @@ -1822,6 +1823,9 @@ def to_pandas_batches( allow_large_results (bool, default None): If not None, overrides the global setting to allow or disallow large query results over the default size limit of 10 GB. + ordered (bool, default True): + Determines whether the resulting pandas dataframes will be ordered. + In some cases, unordered may result in a faster-executing query. Returns: Iterable[pandas.DataFrame]: @@ -1833,6 +1837,7 @@ def to_pandas_batches( page_size=page_size, max_results=max_results, allow_large_results=allow_large_results, + ordered=ordered, ) def _compute_dry_run(self) -> bigquery.QueryJob: diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 27cad16014..fcbc099412 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -30,7 +30,7 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi LIMIT 300 """ ) - batches = df.to_pandas_batches(page_size=100) + batches = df.to_pandas_batches(page_size=100, ordered=False) total_rows = 0 for batch in batches: From 5e0dd948ab84c970c8e7ec4edf55762985b06f8e Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 6 May 2025 14:53:32 -0500 Subject: [PATCH 05/12] add unit test looking for job labels --- bigframes/testing/mocks.py | 16 +++++++++++++--- .../system/small/session/test_read_gbq_colab.py | 8 ++++++-- tests/unit/session/test_read_gbq_colab.py | 17 ++++++++++++++++- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index ab48b97f0d..f73d9962ef 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -51,6 +51,9 @@ def create_bigquery_session( google.auth.credentials.Credentials, instance=True ) + bq_time = datetime.datetime.now() + table_time = bq_time + datetime.timedelta(minutes=1) + if anonymous_dataset is None: anonymous_dataset = google.cloud.bigquery.DatasetReference( "test-project", @@ -65,6 +68,8 @@ def create_bigquery_session( # Mock the location. table = mock.create_autospec(google.cloud.bigquery.Table, instance=True) table._properties = {} + # TODO(tswast): support tables created before and after the session started. + type(table).created = mock.PropertyMock(return_value=table_time) type(table).location = mock.PropertyMock(return_value=location) type(table).schema = mock.PropertyMock(return_value=table_schema) type(table).reference = mock.PropertyMock( @@ -73,7 +78,10 @@ def create_bigquery_session( type(table).num_rows = mock.PropertyMock(return_value=1000000000) bqclient.get_table.return_value = table - def query_mock(query, *args, **kwargs): + job_configs = [] + + def query_mock(query, *args, job_config=None, **kwargs): + job_configs.append(job_config) query_job = mock.create_autospec(google.cloud.bigquery.QueryJob) type(query_job).destination = mock.PropertyMock( return_value=anonymous_dataset.table("test_table"), @@ -83,7 +91,7 @@ def query_mock(query, *args, **kwargs): ) if query.startswith("SELECT CURRENT_TIMESTAMP()"): - query_job.result = mock.MagicMock(return_value=[[datetime.datetime.now()]]) + query_job.result = mock.MagicMock(return_value=[[bq_time]]) else: type(query_job).schema = mock.PropertyMock(return_value=table_schema) @@ -91,7 +99,8 @@ def query_mock(query, *args, **kwargs): existing_query_and_wait = bqclient.query_and_wait - def query_and_wait_mock(query, *args, **kwargs): + def query_and_wait_mock(query, *args, job_config=None, **kwargs): + job_configs.append(job_config) if query.startswith("SELECT CURRENT_TIMESTAMP()"): return iter([[datetime.datetime.now()]]) else: @@ -109,6 +118,7 @@ def query_and_wait_mock(query, *args, **kwargs): session._bq_connection_manager = mock.create_autospec( bigframes.clients.BqConnectionManager, instance=True ) + session._job_configs = job_configs return session diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index fcbc099412..170ab80e84 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -16,7 +16,6 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): - # TODO: why is the order destroyed in strict order mode? df = maybe_ordered_session._read_gbq_colab( """ SELECT @@ -30,7 +29,12 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi LIMIT 300 """ ) - batches = df.to_pandas_batches(page_size=100, ordered=False) + batches = df.to_pandas_batches( + page_size=100, + # In the real app, this will need to be set to False for the raw query + # results and to True if any subsequent ordering has been applied. + ordered=False, + ) total_rows = 0 for batch in batches: diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index cef9665048..ddca220a79 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -14,4 +14,19 @@ """Unit tests for read_gbq_colab helper functions.""" -# TODO: Make sure job label is set. +from bigframes.testing import mocks + + +def test_read_gbq_colab_includes_label(): + """Make sure we can tell direct colab usage apart from regular read_gbq usage.""" + session = mocks.create_bigquery_session() + _ = session._read_gbq_colab("SELECT 'read-gbq-colab-test'") + configs = session._job_configs # type: ignore + + label_values = [] + for config in configs: + if config is None: + continue + label_values.extend(config.labels.values()) + + assert "read_gbq_colab" in label_values From 92ce621cfcb0dae8c7e4fea3f97a57eea2c60c44 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 6 May 2025 15:19:12 -0500 Subject: [PATCH 06/12] remove ordered option for to_pandas_batches --- bigframes/core/blocks.py | 4 +--- bigframes/dataframe.py | 5 ----- tests/system/small/session/test_read_gbq_colab.py | 3 --- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9fb20b4882..d3107a0623 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -577,8 +577,6 @@ def to_pandas_batches( max_results: Optional[int] = None, allow_large_results: Optional[bool] = None, squeeze: Optional[bool] = False, - *, - ordered: bool = True, ): """Download results one message at a time. @@ -586,7 +584,7 @@ def to_pandas_batches( see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result""" execute_result = self.session._executor.execute( self.expr, - ordered=ordered, + ordered=True, use_explicit_destination=allow_large_results, ) for df in execute_result.to_pandas_batches( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 54bcb07385..9cb388329e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1779,7 +1779,6 @@ def to_pandas_batches( max_results: Optional[int] = None, *, allow_large_results: Optional[bool] = None, - ordered: bool = True, ) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame. @@ -1823,9 +1822,6 @@ def to_pandas_batches( allow_large_results (bool, default None): If not None, overrides the global setting to allow or disallow large query results over the default size limit of 10 GB. - ordered (bool, default True): - Determines whether the resulting pandas dataframes will be ordered. - In some cases, unordered may result in a faster-executing query. Returns: Iterable[pandas.DataFrame]: @@ -1837,7 +1833,6 @@ def to_pandas_batches( page_size=page_size, max_results=max_results, allow_large_results=allow_large_results, - ordered=ordered, ) def _compute_dry_run(self) -> bigquery.QueryJob: diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 170ab80e84..7ade85b2b2 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -31,9 +31,6 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi ) batches = df.to_pandas_batches( page_size=100, - # In the real app, this will need to be set to False for the raw query - # results and to True if any subsequent ordering has been applied. - ordered=False, ) total_rows = 0 From 9e3d5c8e3c1223415e9114daa2dd7c0d3bb15066 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 6 May 2025 16:56:58 -0500 Subject: [PATCH 07/12] ignore type for mock job configs --- bigframes/testing/mocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index f73d9962ef..d0cfb50ad5 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -118,7 +118,7 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs): session._bq_connection_manager = mock.create_autospec( bigframes.clients.BqConnectionManager, instance=True ) - session._job_configs = job_configs + session._job_configs = job_configs # type: ignore return session From fa3a98ae1ec2d0a8a9ec16555847f3abad9450b9 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 7 May 2025 16:34:45 -0500 Subject: [PATCH 08/12] chore: add pyformat_args to _read_gbq_colab --- bigframes/core/pyformat.py | 80 +++++++++++++ bigframes/session/__init__.py | 18 ++- tests/unit/core/test_pyformat.py | 191 +++++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 bigframes/core/pyformat.py create mode 100644 tests/unit/core/test_pyformat.py diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py new file mode 100644 index 0000000000..c42e40712b --- /dev/null +++ b/bigframes/core/pyformat.py @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for the pyformat feature.""" + +# TODO(tswast): consolidate with pandas-gbq and bigquery-magics. See: +# https://github.com/googleapis/python-bigquery-magics/blob/main/bigquery_magics/pyformat.py + +from __future__ import annotations + +import numbers +import string +from typing import Any + + +def _field_to_template_value(name: str, value: Any) -> Any: + """Convert value to something embeddable in a SQL string. + + Does **not** escape strings. + """ + _validate_type(name, value) + # TODO(tswast): convert DataFrame objects to gbq tables or a literals subquery. + return value + + +def _validate_type(name: str, value: Any): + """Raises TypeError if value is unsupported.""" + if not isinstance(value, (str, numbers.Real)): + raise TypeError( + f"{name} has unsupported type: {type(value)}. " + "Only str, int, float are supported." + ) + + +def _parse_fields(sql_template: str) -> list[str]: + return [ + field_name + for _, field_name, _, _ in string.Formatter().parse(sql_template) + if field_name is not None + ] + + +def pyformat(sql_template: str, pyformat_args: dict) -> str: + """Unsafe Python-style string formatting of SQL string. + + Only some data types supported. + + Warning: strings are **not** escaped. This allows them to be used in + contexts such as table identifiers, where normal query parameters are not + supported. + + Args: + sql_template (str): + SQL string with 0+ {var_name}-style format options. + pyformat_args (dict): + Variable namespace to use for formatting. + + Raises: + TypeError: if a referenced variable is not of a supported type. + KeyError: if a referenced variable is not found. + """ + fields = _parse_fields(sql_template) + + format_kwargs = {} + for name in fields: + value = pyformat_args[name] + format_kwargs[name] = _field_to_template_value(name, value) + + return sql_template.format(**format_kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7260553c14..054c2c384f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -480,16 +480,30 @@ def _read_gbq_colab( self, query: str, # TODO: Add a callback parameter that takes some kind of Event object. - # TODO: Add parameter for variables for string formatting. # TODO: Add dry_run parameter. + *, + pyformat_args: Optional[Dict[str, Any]] = None, ) -> dataframe.DataFrame: """A version of read_gbq that has the necessary default values for use in colab integrations. This includes, no ordering, no index, no progress bar, always use string formatting for embedding local variables / dataframes. + + Args: + query (str): + A SQL query string to execute. Results (if any) are turned into + a DataFrame. + pyformat_args (dict): + A dictionary of potential variables to replace in ``query``. + Note: strings are _not_ escaped. Use query parameters for these, + instead. Note: unlike read_gbq / read_gbq_query, even if set to + None, this function always assumes {var} refers to a variable + that is supposed to be supplied in this dictionary. """ + if pyformat_args is None: + pyformat_args = {} - # TODO: Allow for a table ID to avoid queries like read_gbq? + # TODO: Allow for a table ID to avoid queries like with read_gbq? return self._loader.read_gbq_query( query=query, index_col=bigframes.enums.DefaultIndexKind.NULL, diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py new file mode 100644 index 0000000000..73e2e51c67 --- /dev/null +++ b/tests/unit/core/test_pyformat.py @@ -0,0 +1,191 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the pyformat feature.""" + +# TODO(tswast): consolidate with pandas-gbq and bigquery-magics. See: +# https://github.com/googleapis/python-bigquery-magics/blob/main/tests/unit/bigquery/test_pyformat.py + +from __future__ import annotations + +from typing import List +from unittest import mock + +import pytest + + +@pytest.mark.parametrize( + ("sql_template", "expected"), + ( + ( + "{my_project}.{my_dataset}.{my_table}", + ["my_project", "my_dataset", "my_table"], + ), + ( + "{{not a format variable}}", + [], + ), + ), +) +def test_parse_fields(sql_template: str, expected: List[str]): + import bigframes.core.pyformat + + fields = bigframes.core.pyformat._parse_fields(sql_template) + fields.sort() + expected.sort() + assert fields == expected + + +@pytest.mark.usefixtures("mock_credentials") +def test_pyformat_with_unsupported_type_raises_typeerror(ipython_ns_cleanup): + globalipapp.start_ipython() + ip = globalipapp.get_ipython() + ip.extension_manager.load_extension("bigquery_magics") + ipython_ns_cleanup.extend( + [ + (ip, "my_object"), + ] + ) + ip.user_ns["my_object"] = object() + + sql = "SELECT {my_object}" + + with pytest.raises(TypeError, match="my_object has unsupported type: "): + ip.run_cell_magic("bigquery", "--pyformat", sql) + + +@pytest.mark.usefixtures("mock_credentials") +def test_pyformat_with_missing_variable_raises_keyerror(): + globalipapp.start_ipython() + ip = globalipapp.get_ipython() + ip.extension_manager.load_extension("bigquery_magics") + + sql = "SELECT {my_object}" + + with pytest.raises(KeyError, match="my_object"): + ip.run_cell_magic("bigquery", "--pyformat", sql) + + +@pytest.mark.usefixtures("mock_credentials") +def test_pyformat_with_no_variables(): + globalipapp.start_ipython() + ip = globalipapp.get_ipython() + ip.extension_manager.load_extension("bigquery_magics") + + run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) + + sql = "SELECT '{{escaped curly brackets}}'" + expected_sql = "SELECT '{escaped curly brackets}'" + + with run_query_patch as run_query_mock: + ip.run_cell_magic("bigquery", "--pyformat", sql) + run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) + + +@pytest.mark.usefixtures("mock_credentials") +def test_pyformat_with_query_string_replaces_variables(ipython_ns_cleanup): + globalipapp.start_ipython() + ip = globalipapp.get_ipython() + ip.extension_manager.load_extension("bigquery_magics") + ipython_ns_cleanup.extend( + [ + # String + (ip, "project"), + (ip, "dataset"), + (ip, "table"), + (ip, "my_string"), + # Float + (ip, "max_value"), + # Integer + (ip, "year"), + # Unused, unsupported types shouldn't raise. + (ip, "my_object"), + ] + ) + ip.user_ns["project"] = "pyformat-project" + ip.user_ns["dataset"] = "pyformat_dataset" + ip.user_ns["table"] = "pyformat_table" + ip.user_ns["my_string"] = "some string value" + ip.user_ns["max_value"] = 2.25 + ip.user_ns["year"] = 2025 + ip.user_ns["my_object"] = object() + + sql = """ + SELECT {year} - year AS age, + @myparam AS myparam, + '{{my_string}}' AS escaped_string, + FROM `{project}.{dataset}.{table}` + WHERE height < {max_value} + """ + expected_sql = """ + SELECT 2025 - year AS age, + @myparam AS myparam, + '{my_string}' AS escaped_string, + FROM `pyformat-project.pyformat_dataset.pyformat_table` + WHERE height < 2.25 + """.strip() + + run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) + + with run_query_patch as run_query_mock: + ip.run_cell_magic("bigquery", "--pyformat --params {'myparam': 42}", sql) + run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) + + +@pytest.mark.usefixtures("mock_credentials") +def test_pyformat_with_query_dollar_variable_replaces_variables(ipython_ns_cleanup): + globalipapp.start_ipython() + ip = globalipapp.get_ipython() + ip.extension_manager.load_extension("bigquery_magics") + + ipython_ns_cleanup.extend( + [ + (ip, "custom_query"), + (ip, "my_string"), + ] + ) + run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) + + sql = "SELECT 42, '{my_string}'" + expected_sql = "SELECT 42, 'This is a test'" + ip.user_ns["my_string"] = "This is a test" + ip.user_ns["custom_query"] = sql + + cell_body = "$custom_query" # Referring to an existing variable name (custom_query) + + with run_query_patch as run_query_mock: + ip.run_cell_magic("bigquery", "--pyformat", cell_body) + run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) + + +@pytest.mark.usefixtures("mock_credentials") +def test_without_pyformat_doesnt_modify_curly_brackets(ipython_ns_cleanup): + globalipapp.start_ipython() + ip = globalipapp.get_ipython() + ip.extension_manager.load_extension("bigquery_magics") + + ipython_ns_cleanup.extend( + [ + (ip, "my_string"), + ] + ) + run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) + + sql = "SELECT 42, '{my_string}'" + expected_sql = "SELECT 42, '{my_string}'" # No pyformat means no escaping needed. + ip.user_ns["my_string"] = "This is a test" + + with run_query_patch as run_query_mock: + ip.run_cell_magic("bigquery", "", sql) + run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) From eec8cca4ae0960d4831a11239d82d9bd6ae07351 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 8 May 2025 12:22:15 -0500 Subject: [PATCH 09/12] fix unit tests --- bigframes/core/pyformat.py | 8 +- bigframes/session/__init__.py | 11 ++- tests/unit/core/test_pyformat.py | 147 ++++++------------------------- 3 files changed, 46 insertions(+), 120 deletions(-) diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index c42e40712b..1a2c1f8a0e 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -51,7 +51,13 @@ def _parse_fields(sql_template: str) -> list[str]: ] -def pyformat(sql_template: str, pyformat_args: dict) -> str: +def pyformat( + sql_template: str, + *, + pyformat_args: dict, + # TODO: add dry_run parameter to avoid expensive API calls in conversion + # TODO: and session to upload data / convert to table if necessary +) -> str: """Unsafe Python-style string formatting of SQL string. Only some data types supported. diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 054c2c384f..b54dc1d691 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -61,6 +61,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.clients from bigframes.core import blocks +import bigframes.core.pyformat # Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. @@ -500,10 +501,18 @@ def _read_gbq_colab( None, this function always assumes {var} refers to a variable that is supposed to be supplied in this dictionary. """ + # TODO: Allow for a table ID to avoid queries like with read_gbq? + if pyformat_args is None: pyformat_args = {} - # TODO: Allow for a table ID to avoid queries like with read_gbq? + # TODO: move this to read_gbq_query if/when we expose this feature + # beyond in _read_gbq_colab. + query = bigframes.core.pyformat.pyformat( + query, + pyformat_args=pyformat_args, + ) + return self._loader.read_gbq_query( query=query, index_col=bigframes.enums.DefaultIndexKind.NULL, diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 73e2e51c67..881dbd8b63 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -19,11 +19,12 @@ from __future__ import annotations -from typing import List -from unittest import mock +from typing import Any, Dict, List import pytest +import bigframes.core.pyformat as pyformat + @pytest.mark.parametrize( ("sql_template", "expected"), @@ -39,87 +40,47 @@ ), ) def test_parse_fields(sql_template: str, expected: List[str]): - import bigframes.core.pyformat - - fields = bigframes.core.pyformat._parse_fields(sql_template) + fields = pyformat._parse_fields(sql_template) fields.sort() expected.sort() assert fields == expected -@pytest.mark.usefixtures("mock_credentials") -def test_pyformat_with_unsupported_type_raises_typeerror(ipython_ns_cleanup): - globalipapp.start_ipython() - ip = globalipapp.get_ipython() - ip.extension_manager.load_extension("bigquery_magics") - ipython_ns_cleanup.extend( - [ - (ip, "my_object"), - ] - ) - ip.user_ns["my_object"] = object() - +def test_pyformat_with_unsupported_type_raises_typeerror(): + pyformat_args = {"my_object": object()} sql = "SELECT {my_object}" with pytest.raises(TypeError, match="my_object has unsupported type: "): - ip.run_cell_magic("bigquery", "--pyformat", sql) + pyformat.pyformat(sql, pyformat_args=pyformat_args) -@pytest.mark.usefixtures("mock_credentials") def test_pyformat_with_missing_variable_raises_keyerror(): - globalipapp.start_ipython() - ip = globalipapp.get_ipython() - ip.extension_manager.load_extension("bigquery_magics") - + pyformat_args: Dict[str, Any] = {} sql = "SELECT {my_object}" with pytest.raises(KeyError, match="my_object"): - ip.run_cell_magic("bigquery", "--pyformat", sql) + pyformat.pyformat(sql, pyformat_args=pyformat_args) -@pytest.mark.usefixtures("mock_credentials") def test_pyformat_with_no_variables(): - globalipapp.start_ipython() - ip = globalipapp.get_ipython() - ip.extension_manager.load_extension("bigquery_magics") - - run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) - + pyformat_args: Dict[str, Any] = {} sql = "SELECT '{{escaped curly brackets}}'" expected_sql = "SELECT '{escaped curly brackets}'" - - with run_query_patch as run_query_mock: - ip.run_cell_magic("bigquery", "--pyformat", sql) - run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) - - -@pytest.mark.usefixtures("mock_credentials") -def test_pyformat_with_query_string_replaces_variables(ipython_ns_cleanup): - globalipapp.start_ipython() - ip = globalipapp.get_ipython() - ip.extension_manager.load_extension("bigquery_magics") - ipython_ns_cleanup.extend( - [ - # String - (ip, "project"), - (ip, "dataset"), - (ip, "table"), - (ip, "my_string"), - # Float - (ip, "max_value"), - # Integer - (ip, "year"), - # Unused, unsupported types shouldn't raise. - (ip, "my_object"), - ] - ) - ip.user_ns["project"] = "pyformat-project" - ip.user_ns["dataset"] = "pyformat_dataset" - ip.user_ns["table"] = "pyformat_table" - ip.user_ns["my_string"] = "some string value" - ip.user_ns["max_value"] = 2.25 - ip.user_ns["year"] = 2025 - ip.user_ns["my_object"] = object() + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args) + assert got_sql == expected_sql + + +def test_pyformat_with_query_string_replaces_variables(): + pyformat_args = { + "project": "pyformat-project", + "dataset": "pyformat_dataset", + "table": "pyformat_table", + "my_string": "some string value", + "max_value": 2.25, + "year": 2025, + # Unreferenced values of unsupported type shouldn't cause issues. + "my_object": object(), + } sql = """ SELECT {year} - year AS age, @@ -127,7 +88,8 @@ def test_pyformat_with_query_string_replaces_variables(ipython_ns_cleanup): '{{my_string}}' AS escaped_string, FROM `{project}.{dataset}.{table}` WHERE height < {max_value} - """ + """.strip() + expected_sql = """ SELECT 2025 - year AS age, @myparam AS myparam, @@ -136,56 +98,5 @@ def test_pyformat_with_query_string_replaces_variables(ipython_ns_cleanup): WHERE height < 2.25 """.strip() - run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) - - with run_query_patch as run_query_mock: - ip.run_cell_magic("bigquery", "--pyformat --params {'myparam': 42}", sql) - run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) - - -@pytest.mark.usefixtures("mock_credentials") -def test_pyformat_with_query_dollar_variable_replaces_variables(ipython_ns_cleanup): - globalipapp.start_ipython() - ip = globalipapp.get_ipython() - ip.extension_manager.load_extension("bigquery_magics") - - ipython_ns_cleanup.extend( - [ - (ip, "custom_query"), - (ip, "my_string"), - ] - ) - run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) - - sql = "SELECT 42, '{my_string}'" - expected_sql = "SELECT 42, 'This is a test'" - ip.user_ns["my_string"] = "This is a test" - ip.user_ns["custom_query"] = sql - - cell_body = "$custom_query" # Referring to an existing variable name (custom_query) - - with run_query_patch as run_query_mock: - ip.run_cell_magic("bigquery", "--pyformat", cell_body) - run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) - - -@pytest.mark.usefixtures("mock_credentials") -def test_without_pyformat_doesnt_modify_curly_brackets(ipython_ns_cleanup): - globalipapp.start_ipython() - ip = globalipapp.get_ipython() - ip.extension_manager.load_extension("bigquery_magics") - - ipython_ns_cleanup.extend( - [ - (ip, "my_string"), - ] - ) - run_query_patch = mock.patch("bigquery_magics.bigquery._run_query", autospec=True) - - sql = "SELECT 42, '{my_string}'" - expected_sql = "SELECT 42, '{my_string}'" # No pyformat means no escaping needed. - ip.user_ns["my_string"] = "This is a test" - - with run_query_patch as run_query_mock: - ip.run_cell_magic("bigquery", "", sql) - run_query_mock.assert_called_once_with(mock.ANY, expected_sql, mock.ANY) + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args) + assert got_sql == expected_sql From 8b086a28d27ee75f78c77f518fcb93c368fb4e00 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 8 May 2025 12:34:19 -0500 Subject: [PATCH 10/12] add test for _read_gbq_colab --- .../small/session/test_read_gbq_colab.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 7ade85b2b2..76287ae805 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -14,6 +14,9 @@ """System tests for read_gbq_colab helper functions.""" +import pandas +import pandas.testing + def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): df = maybe_ordered_session._read_gbq_colab( @@ -39,3 +42,34 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi total_rows += len(batch.index) assert total_rows > 0 + + +def test_read_gbq_colab_includes_formatted_scalars(session): + pyformat_args = { + "some_integer": 123, + "some_string": "This is dangerous, but useful for table IDs", + # This is not a supported type, but ignored if not referenced. + "some_object": object(), + } + df = session._read_gbq_colab( + """ + SELECT {some_integer} as some_integer, + '{some_string}' as some_string, + '{{escaped}}' as escaped + """, + pyformat_args=pyformat_args, + ) + result = df.to_pandas() + pandas.testing.assert_frame_equal( + result, + pandas.DataFrame( + { + "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()), + "some_string": pandas.Series( + ["This is dangerous, but useful for table IDs"], + dtype="string[pyarrow]", + ), + "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"), + } + ), + ) From 8d22c0ac3e5472b55ab6004bafd8fd855c1378b2 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Fri, 9 May 2025 10:35:52 -0500 Subject: [PATCH 11/12] escape strings --- bigframes/core/pyformat.py | 40 +++++++++++---- bigframes/core/sql.py | 19 +++++-- .../small/session/test_read_gbq_colab.py | 38 ++++++++++++-- tests/unit/core/test_pyformat.py | 50 +++++++++++++++++-- 4 files changed, 127 insertions(+), 20 deletions(-) diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 1a2c1f8a0e..8b64396731 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -19,27 +19,49 @@ from __future__ import annotations -import numbers import string -from typing import Any +import typing +from typing import Any, Union +import google.cloud.bigquery +import google.cloud.bigquery.table -def _field_to_template_value(name: str, value: Any) -> Any: - """Convert value to something embeddable in a SQL string. +_BQ_TABLE_TYPES = Union[ + google.cloud.bigquery.Table, + google.cloud.bigquery.TableReference, + google.cloud.bigquery.table.TableListItem, +] + + +def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: + return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" + + +def _field_to_template_value(name: str, value: Any) -> str: + """Convert value to something embeddable in a SQL string.""" + import bigframes.core.sql # Avoid circular imports - Does **not** escape strings. - """ _validate_type(name, value) + + table_types = typing.get_args(_BQ_TABLE_TYPES) + if isinstance(value, table_types): + return _table_to_sql(value) + # TODO(tswast): convert DataFrame objects to gbq tables or a literals subquery. - return value + return bigframes.core.sql.simple_literal(value) def _validate_type(name: str, value: Any): """Raises TypeError if value is unsupported.""" - if not isinstance(value, (str, numbers.Real)): + import bigframes.core.sql # Avoid circular imports + + supported_types = typing.get_args(_BQ_TABLE_TYPES) + typing.get_args( + bigframes.core.sql.SIMPLE_LITERAL_TYPES + ) + if not isinstance(value, supported_types): raise TypeError( f"{name} has unsupported type: {type(value)}. " - "Only str, int, float are supported." + f"Only {supported_types} are supported." ) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 04e678e713..328da78486 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -42,10 +42,23 @@ to_wkt = dumps +SIMPLE_LITERAL_TYPES = Union[ + None, + bytes, + str, + int, + bool, + float, + datetime.datetime, + datetime.date, + datetime.time, + decimal.Decimal, + list, +] + + ### Writing SQL Values (literals, column references, table references, etc.) -def simple_literal( - value: bytes | str | int | bool | float | datetime.datetime | list | None, -): +def simple_literal(value: SIMPLE_LITERAL_TYPES) -> str: """Return quoted input string.""" # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 76287ae805..54fdd4014e 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -47,14 +47,14 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi def test_read_gbq_colab_includes_formatted_scalars(session): pyformat_args = { "some_integer": 123, - "some_string": "This is dangerous, but useful for table IDs", + "some_string": "This could be dangerous, but we esape it", # This is not a supported type, but ignored if not referenced. "some_object": object(), } df = session._read_gbq_colab( """ SELECT {some_integer} as some_integer, - '{some_string}' as some_string, + {some_string} as some_string, '{{escaped}}' as escaped """, pyformat_args=pyformat_args, @@ -66,7 +66,39 @@ def test_read_gbq_colab_includes_formatted_scalars(session): { "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()), "some_string": pandas.Series( - ["This is dangerous, but useful for table IDs"], + ["This could be dangerous, but we esape it"], + dtype="string[pyarrow]", + ), + "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"), + } + ), + ) + + +def test_read_gbq_colab_includes_formatted_bigframes_dataframe(session): + pyformat_args = { + # TODO: put a bigframes DataFrame here. + "some_integer": 123, + "some_string": "This could be dangerous, but we esape it", + # This is not a supported type, but ignored if not referenced. + "some_object": object(), + } + df = session._read_gbq_colab( + """ + SELECT {some_integer} as some_integer, + {some_string} as some_string, + '{{escaped}}' as escaped + """, + pyformat_args=pyformat_args, + ) + result = df.to_pandas() + pandas.testing.assert_frame_equal( + result, + pandas.DataFrame( + { + "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()), + "some_string": pandas.Series( + ["This could be dangerous, but we esape it"], dtype="string[pyarrow]", ), "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"), diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 881dbd8b63..69ca4000bc 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -21,6 +21,8 @@ from typing import Any, Dict, List +import google.cloud.bigquery +import google.cloud.bigquery.table import pytest import bigframes.core.pyformat as pyformat @@ -72,9 +74,6 @@ def test_pyformat_with_no_variables(): def test_pyformat_with_query_string_replaces_variables(): pyformat_args = { - "project": "pyformat-project", - "dataset": "pyformat_dataset", - "table": "pyformat_table", "my_string": "some string value", "max_value": 2.25, "year": 2025, @@ -86,7 +85,8 @@ def test_pyformat_with_query_string_replaces_variables(): SELECT {year} - year AS age, @myparam AS myparam, '{{my_string}}' AS escaped_string, - FROM `{project}.{dataset}.{table}` + {my_string} AS my_string, + FROM my_dataset.my_table WHERE height < {max_value} """.strip() @@ -94,9 +94,49 @@ def test_pyformat_with_query_string_replaces_variables(): SELECT 2025 - year AS age, @myparam AS myparam, '{my_string}' AS escaped_string, - FROM `pyformat-project.pyformat_dataset.pyformat_table` + 'some string value' AS my_string, + FROM my_dataset.my_table WHERE height < 2.25 """.strip() got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args) assert got_sql == expected_sql + + +@pytest.mark.parametrize( + ("table", "expected_sql"), + ( + ( + google.cloud.bigquery.Table("my-project.my_dataset.my_table"), + "SELECT * FROM `my-project`.`my_dataset`.`my_table`", + ), + ( + google.cloud.bigquery.TableReference( + google.cloud.bigquery.DatasetReference("some-project", "some_dataset"), + "some_table", + ), + "SELECT * FROM `some-project`.`some_dataset`.`some_table`", + ), + ( + google.cloud.bigquery.table.TableListItem( + { + "tableReference": { + "projectId": "ListedProject", + "datasetId": "ListedDataset", + "tableId": "ListedTable", + } + } + ), + "SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`", + ), + ), +) +def test_pyformat_with_table_replaces_variables(table, expected_sql): + pyformat_args = { + "table": table, + # Unreferenced values of unsupported type shouldn't cause issues. + "my_object": object(), + } + sql = "SELECT * FROM {table}" + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args) + assert got_sql == expected_sql From efbaa8b9a35fe82c7dbcc57831e7f73b0c900d91 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Fri, 9 May 2025 10:39:14 -0500 Subject: [PATCH 12/12] fix null support --- bigframes/core/pyformat.py | 3 +++ bigframes/core/sql.py | 3 +-- tests/unit/core/test_pyformat.py | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 8b64396731..98f175d300 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -55,6 +55,9 @@ def _validate_type(name: str, value: Any): """Raises TypeError if value is unsupported.""" import bigframes.core.sql # Avoid circular imports + if value is None: + return # None can't be used in isinstance, but is a valid literal. + supported_types = typing.get_args(_BQ_TABLE_TYPES) + typing.get_args( bigframes.core.sql.SIMPLE_LITERAL_TYPES ) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 328da78486..ccd2a16ddc 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -43,7 +43,6 @@ SIMPLE_LITERAL_TYPES = Union[ - None, bytes, str, int, @@ -58,7 +57,7 @@ ### Writing SQL Values (literals, column references, table references, etc.) -def simple_literal(value: SIMPLE_LITERAL_TYPES) -> str: +def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: """Return quoted input string.""" # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index 69ca4000bc..466f3d6116 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -77,6 +77,7 @@ def test_pyformat_with_query_string_replaces_variables(): "my_string": "some string value", "max_value": 2.25, "year": 2025, + "null_value": None, # Unreferenced values of unsupported type shouldn't cause issues. "my_object": object(), } @@ -86,6 +87,7 @@ def test_pyformat_with_query_string_replaces_variables(): @myparam AS myparam, '{{my_string}}' AS escaped_string, {my_string} AS my_string, + {null_value} AS null_value, FROM my_dataset.my_table WHERE height < {max_value} """.strip() @@ -95,6 +97,7 @@ def test_pyformat_with_query_string_replaces_variables(): @myparam AS myparam, '{my_string}' AS escaped_string, 'some string value' AS my_string, + NULL AS null_value, FROM my_dataset.my_table WHERE height < 2.25 """.strip()