Commit 85cede2

feat: add configuration option to read_gbq (#401)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes https://togithub.com/googleapis/python-bigquery-dataframes/issues/384 🦕
1 parent ad0e99e commit 85cede2
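For context, this change lets `read_gbq` forward a BigQuery REST API job configuration to the query job. A minimal usage sketch of what the commit enables — the query and public table here are illustrative, not taken from the PR:

```python
import bigframes.pandas as bpd

# Keys follow the BigQuery REST API JobConfiguration resource
# (jobs#configuration.query).
config = {
    "query": {
        "useQueryCache": False,  # bypass BigQuery's result cache
        "maximumBytesBilled": "1000000000",  # fail queries that would bill more
        "timeoutMs": 10000,  # rewritten to the server-side jobTimeoutMs property
    }
}

df = bpd.read_gbq(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10",
    configuration=config,
)
```

Per the validation added in `bigframes/session/__init__.py` below, `configuration` is only accepted for queries — passing it while reading a table ID directly raises `ValueError`, as does combining the `use_cache` parameter with a `useQueryCache` key in `configuration`.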

File tree: 5 files changed (+145, -19 lines)


bigframes/pandas/__init__.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -492,9 +492,10 @@ def read_gbq(
     *,
     index_col: Iterable[str] | str = (),
     columns: Iterable[str] = (),
+    configuration: Optional[Dict] = None,
     max_results: Optional[int] = None,
     filters: vendored_pandas_gbq.FiltersType = (),
-    use_cache: bool = True,
+    use_cache: Optional[bool] = None,
     col_order: Iterable[str] = (),
 ) -> bigframes.dataframe.DataFrame:
     _set_default_session_location_if_possible(query_or_table)
@@ -503,6 +504,7 @@ def read_gbq(
         query_or_table,
         index_col=index_col,
         columns=columns,
+        configuration=configuration,
         max_results=max_results,
         filters=filters,
         use_cache=use_cache,
@@ -528,8 +530,9 @@ def read_gbq_query(
     *,
     index_col: Iterable[str] | str = (),
     columns: Iterable[str] = (),
+    configuration: Optional[Dict] = None,
     max_results: Optional[int] = None,
-    use_cache: bool = True,
+    use_cache: Optional[bool] = None,
     col_order: Iterable[str] = (),
 ) -> bigframes.dataframe.DataFrame:
     _set_default_session_location_if_possible(query)
@@ -538,6 +541,7 @@ def read_gbq_query(
         query,
         index_col=index_col,
         columns=columns,
+        configuration=configuration,
         max_results=max_results,
         use_cache=use_cache,
         col_order=col_order,
```

bigframes/session/__init__.py

Lines changed: 83 additions & 12 deletions
```diff
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import copy
 import datetime
 import itertools
 import logging
@@ -283,9 +284,10 @@ def read_gbq(
         *,
         index_col: Iterable[str] | str = (),
         columns: Iterable[str] = (),
+        configuration: Optional[Dict] = None,
         max_results: Optional[int] = None,
         filters: third_party_pandas_gbq.FiltersType = (),
-        use_cache: bool = True,
+        use_cache: Optional[bool] = None,
         col_order: Iterable[str] = (),
         # Add a verify index argument that fails if the index is not unique.
     ) -> dataframe.DataFrame:
@@ -306,6 +308,7 @@ def read_gbq(
                 query_or_table,
                 index_col=index_col,
                 columns=columns,
+                configuration=configuration,
                 max_results=max_results,
                 api_name="read_gbq",
                 use_cache=use_cache,
@@ -314,13 +317,20 @@ def read_gbq(
             # TODO(swast): Query the snapshot table but mark it as a
             # deterministic query so we can avoid serializing if we have a
             # unique index.
+            if configuration is not None:
+                raise ValueError(
+                    "The 'configuration' argument is not allowed when "
+                    "directly reading from a table. Please remove "
+                    "'configuration' or use a query."
+                )
+
             return self._read_gbq_table(
                 query_or_table,
                 index_col=index_col,
                 columns=columns,
                 max_results=max_results,
                 api_name="read_gbq",
-                use_cache=use_cache,
+                use_cache=use_cache if use_cache is not None else True,
             )
 
     def _to_query(
@@ -405,7 +415,7 @@ def _query_to_destination(
         query: str,
         index_cols: List[str],
         api_name: str,
-        use_cache: bool = True,
+        configuration: dict = {"query": {"useQueryCache": True}},
     ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]:
         # If a dry_run indicates this is not a query type job, then don't
         # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
@@ -427,23 +437,35 @@ def _query_to_destination(
         ][:_MAX_CLUSTER_COLUMNS]
         temp_table = self._create_empty_temp_table(schema, cluster_cols)
 
-        job_config = bigquery.QueryJobConfig()
+        timeout_ms = configuration.get("jobTimeoutMs") or configuration["query"].get(
+            "timeoutMs"
+        )
+
+        # Convert timeout_ms to seconds, ensuring a minimum of 0.1 seconds to avoid
+        # the program getting stuck on too-short timeouts.
+        timeout = max(int(timeout_ms) * 1e-3, 0.1) if timeout_ms else None
+
+        job_config = typing.cast(
+            bigquery.QueryJobConfig,
+            bigquery.QueryJobConfig.from_api_repr(configuration),
+        )
         job_config.labels["bigframes-api"] = api_name
         job_config.destination = temp_table
-        job_config.use_query_cache = use_cache
 
         try:
             # Write to temp table to workaround BigQuery 10 GB query results
             # limit. See: internal issue 303057336.
             job_config.labels["error_caught"] = "true"
-            _, query_job = self._start_query(query, job_config=job_config)
+            _, query_job = self._start_query(
+                query, job_config=job_config, timeout=timeout
+            )
             return query_job.destination, query_job
         except google.api_core.exceptions.BadRequest:
             # Some SELECT statements still aren't compatible with cluster
             # tables as the destination. For example, if the query has a
             # top-level ORDER BY, this conflicts with our ability to cluster
             # the table by the index column(s).
-            _, query_job = self._start_query(query)
+            _, query_job = self._start_query(query, timeout=timeout)
             return query_job.destination, query_job
 
     def read_gbq_query(
@@ -452,8 +474,9 @@ def read_gbq_query(
         *,
         index_col: Iterable[str] | str = (),
         columns: Iterable[str] = (),
+        configuration: Optional[Dict] = None,
         max_results: Optional[int] = None,
-        use_cache: bool = True,
+        use_cache: Optional[bool] = None,
         col_order: Iterable[str] = (),
     ) -> dataframe.DataFrame:
         """Turn a SQL query into a DataFrame.
@@ -517,6 +540,7 @@ def read_gbq_query(
             query=query,
             index_col=index_col,
             columns=columns,
+            configuration=configuration,
             max_results=max_results,
             api_name="read_gbq_query",
             use_cache=use_cache,
@@ -528,10 +552,34 @@ def _read_gbq_query(
         *,
         index_col: Iterable[str] | str = (),
         columns: Iterable[str] = (),
+        configuration: Optional[Dict] = None,
         max_results: Optional[int] = None,
         api_name: str = "read_gbq_query",
-        use_cache: bool = True,
+        use_cache: Optional[bool] = None,
     ) -> dataframe.DataFrame:
+        configuration = _transform_read_gbq_configuration(configuration)
+
+        if "query" not in configuration:
+            configuration["query"] = {}
+
+        if "query" in configuration["query"]:
+            raise ValueError(
+                "The query statement must not be included in the ",
+                "'configuration' because it is already provided as",
+                " a separate parameter.",
+            )
+
+        if "useQueryCache" in configuration["query"]:
+            if use_cache is not None:
+                raise ValueError(
+                    "'useQueryCache' in 'configuration' conflicts with"
+                    " 'use_cache' parameter. Please specify only one."
+                )
+        else:
+            configuration["query"]["useQueryCache"] = (
+                True if use_cache is None else use_cache
+            )
+
         if isinstance(index_col, str):
             index_cols = [index_col]
         else:
@@ -541,7 +589,7 @@ def _read_gbq_query(
             query,
             index_cols,
             api_name=api_name,
-            use_cache=use_cache,
+            configuration=configuration,
         )
 
         # If there was no destination table, that means the query must have
@@ -565,7 +613,7 @@ def _read_gbq_query(
             index_col=index_cols,
             columns=columns,
            max_results=max_results,
-            use_cache=use_cache,
+            use_cache=configuration["query"]["useQueryCache"],
         )
 
     def read_gbq_table(
@@ -1656,13 +1704,14 @@ def _start_query(
         sql: str,
         job_config: Optional[bigquery.job.QueryJobConfig] = None,
         max_results: Optional[int] = None,
+        timeout: Optional[float] = None,
     ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         """
        Starts BigQuery query job and waits for results.
         """
         job_config = self._prepare_query_job_config(job_config)
         return bigframes.session._io.bigquery.start_query_with_client(
-            self.bqclient, sql, job_config, max_results
+            self.bqclient, sql, job_config, max_results, timeout
         )
 
     def _start_query_ml_ddl(
@@ -1876,3 +1925,25 @@ def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringVa
     # Escape backslashes and use backslash as delineator
     escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\")  # type: ignore
     return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped)
+
+
+def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict:
+    """
+    For backwards-compatibility, convert any previously client-side only
+    parameters such as timeoutMs to the property name expected by the REST API.
+
+    Makes a copy of configuration if changes are needed.
+    """
+
+    if configuration is None:
+        return {}
+
+    timeout_ms = configuration.get("query", {}).get("timeoutMs")
+    if timeout_ms is not None:
+        # Transform timeoutMs to an actual server-side configuration.
+        # https://github.com/googleapis/python-bigquery-pandas/issues/479
+        configuration = copy.deepcopy(configuration)
+        del configuration["query"]["timeoutMs"]
+        configuration["jobTimeoutMs"] = timeout_ms
+
+    return configuration
```
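The backwards-compatibility shim at the bottom of this file is the subtlest piece, so here is a sketch of its behavior, following the function exactly as committed (the values are illustrative):

```python
# Legacy client-side timeoutMs is rewritten to the REST API's top-level
# jobTimeoutMs property; unrelated keys pass through untouched.
config = {"query": {"useQueryCache": True, "timeoutMs": 10000}}
transformed = _transform_read_gbq_configuration(config)
assert transformed == {"query": {"useQueryCache": True}, "jobTimeoutMs": 10000}

# The input dict is deep-copied before mutation, so the caller's copy survives:
assert config["query"]["timeoutMs"] == 10000

# No configuration at all normalizes to an empty dict:
assert _transform_read_gbq_configuration(None) == {}
```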

bigframes/session/_io/bigquery.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -220,6 +220,7 @@ def start_query_with_client(
     sql: str,
     job_config: bigquery.job.QueryJobConfig,
     max_results: Optional[int] = None,
+    timeout: Optional[float] = None,
 ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
     """
     Starts query job and waits for results.
@@ -230,7 +231,7 @@ def start_query_with_client(
     )
 
     try:
-        query_job = bq_client.query(sql, job_config=job_config)
+        query_job = bq_client.query(sql, job_config=job_config, timeout=timeout)
     except google.api_core.exceptions.Forbidden as ex:
         if "Drive credentials" in ex.message:
             ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions."
```
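`google.cloud.bigquery.Client.query` takes its `timeout` in seconds, which is why the session layer converts the millisecond-based configuration values before they reach this helper. A usage sketch of the widened signature — the client construction and SQL are illustrative, not from the PR:

```python
import google.cloud.bigquery as bigquery

import bigframes.session._io.bigquery as bf_io_bigquery

bq_client = bigquery.Client()  # assumes default project and credentials
job_config = bigquery.QueryJobConfig(use_query_cache=False)

# timeout (seconds) is forwarded to bq_client.query(); None preserves the
# old behavior of waiting with no client-side deadline.
rows, query_job = bf_io_bigquery.start_query_with_client(
    bq_client,
    "SELECT 1 AS x",
    job_config,
    max_results=None,
    timeout=10.0,
)
```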

tests/system/small/test_session.py

Lines changed: 42 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@
 import typing
 from typing import List
 
+import google
 import google.cloud.bigquery as bigquery
 import numpy as np
 import pandas as pd
@@ -363,6 +364,47 @@ def test_read_gbq_table_wildcard_with_filter(session: bigframes.Session):
     assert df.shape == (348485, 32)
 
 
+@pytest.mark.parametrize(
+    ("config"),
+    [
+        {
+            "query": {
+                "useQueryCache": True,
+                "maximumBytesBilled": "1000000000",
+                "timeoutMs": 10000,
+            }
+        },
+        pytest.param(
+            {"query": {"useQueryCache": True, "timeoutMs": 50}},
+            marks=pytest.mark.xfail(
+                raises=google.api_core.exceptions.BadRequest,
+                reason="Expected failure due to timeout being set too short.",
+            ),
+        ),
+        pytest.param(
+            {"query": {"useQueryCache": False, "maximumBytesBilled": "100"}},
+            marks=pytest.mark.xfail(
+                raises=google.api_core.exceptions.InternalServerError,
+                reason="Expected failure when the query exceeds the maximum bytes billed limit.",
+            ),
+        ),
+    ],
+)
+def test_read_gbq_with_configuration(
+    session: bigframes.Session, scalars_table_id: str, config: dict
+):
+    query = f"""SELECT
+            t.float64_col * 2 AS my_floats,
+            CONCAT(t.string_col, "_2") AS my_strings,
+            t.int64_col > 0 AS my_bools,
+        FROM `{scalars_table_id}` AS t
+        """
+
+    df = session.read_gbq(query, configuration=config)
+
+    assert df.shape == (9, 3)
+
+
 def test_read_gbq_model(session, penguins_linear_model_name):
     model = session.read_gbq_model(penguins_linear_model_name)
     assert isinstance(model, bigframes.ml.linear_model.LinearRegression)
```

third_party/bigframes_vendored/pandas/io/gbq.py

Lines changed: 12 additions & 4 deletions
```diff
@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-from typing import Any, Iterable, Literal, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Literal, Optional, Tuple, Union
 
 from bigframes import constants
 
@@ -19,9 +19,10 @@ def read_gbq(
     *,
     index_col: Iterable[str] | str = (),
     columns: Iterable[str] = (),
+    configuration: Optional[Dict] = None,
     max_results: Optional[int] = None,
     filters: FiltersType = (),
-    use_cache: bool = True,
+    use_cache: Optional[bool] = None,
     col_order: Iterable[str] = (),
 ):
     """Loads a DataFrame from BigQuery.
@@ -107,6 +108,11 @@ def read_gbq(
         columns (Iterable[str]):
             List of BigQuery column names in the desired order for results
             DataFrame.
+        configuration (dict, optional):
+            Query config parameters for job processing.
+            For example: configuration = {'query': {'useQueryCache': False}}.
+            For more information see `BigQuery REST API Reference
+            <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
         max_results (Optional[int], default None):
             If set, limit the maximum number of rows to fetch from the
             query results.
@@ -121,8 +127,10 @@ def read_gbq(
             If using wildcard table suffix in query_or_table, can specify
             '_table_suffix' pseudo column to filter the tables to be read
             into the DataFrame.
-        use_cache (bool, default True):
-            Whether to cache the query inputs. Default to True.
+        use_cache (Optional[bool], default None):
+            Caches query results if set to `True`. When `None`, it behaves
+            as `True`, but should not be combined with `useQueryCache` in
+            `configuration` to avoid conflicts.
         col_order (Iterable[str]):
             Alias for columns, retained for backwards compatibility.
```
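Given the validation added in `_read_gbq_query`, the documented interplay between `use_cache` and `useQueryCache` can be pinned down with a sketch of the behavior as implemented in this commit (queries are illustrative):

```python
import bigframes.pandas as bpd

# Equivalent ways to disable the query cache:
df1 = bpd.read_gbq("SELECT 1 AS x", use_cache=False)
df2 = bpd.read_gbq(
    "SELECT 1 AS x", configuration={"query": {"useQueryCache": False}}
)

# Supplying both raises, even when the two values agree:
try:
    bpd.read_gbq(
        "SELECT 1 AS x",
        use_cache=False,
        configuration={"query": {"useQueryCache": False}},
    )
except ValueError as exc:
    print(exc)  # 'useQueryCache' in 'configuration' conflicts with 'use_cache'
```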
