@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import copy
 import datetime
 import itertools
 import logging
@@ -283,9 +284,10 @@ def read_gbq(
         *,
         index_col: Iterable[str] | str = (),
         columns: Iterable[str] = (),
+        configuration: Optional[Dict] = None,
         max_results: Optional[int] = None,
         filters: third_party_pandas_gbq.FiltersType = (),
-        use_cache: bool = True,
+        use_cache: Optional[bool] = None,
         col_order: Iterable[str] = (),
         # Add a verify index argument that fails if the index is not unique.
     ) -> dataframe.DataFrame:
@@ -306,6 +308,7 @@ def read_gbq(
                 query_or_table,
                 index_col=index_col,
                 columns=columns,
+                configuration=configuration,
                 max_results=max_results,
                 api_name="read_gbq",
                 use_cache=use_cache,
@@ -314,13 +317,20 @@ def read_gbq(
             # TODO(swast): Query the snapshot table but mark it as a
             # deterministic query so we can avoid serializing if we have a
             # unique index.
+            if configuration is not None:
+                raise ValueError(
+                    "The 'configuration' argument is not allowed when "
+                    "directly reading from a table. Please remove "
+                    "'configuration' or use a query."
+                )
+
             return self._read_gbq_table(
                 query_or_table,
                 index_col=index_col,
                 columns=columns,
                 max_results=max_results,
                 api_name="read_gbq",
-                use_cache=use_cache,
+                use_cache=use_cache if use_cache is not None else True,
             )
 
     def _to_query(
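Taken together, the read_gbq changes above accept a BigQuery REST job configuration on the query path and reject it on the plain table path. A minimal usage sketch, assuming the public bigframes.pandas wrapper forwards these keywords unchanged (project, dataset, and table names are placeholders):

    import bigframes.pandas as bpd

    # Equivalent to configuration={"query": {"useQueryCache": False}}.
    df = bpd.read_gbq(
        "SELECT name FROM `my-project.my_dataset.my_table`",
        use_cache=False,
    )

    # Or pass the REST job configuration directly, e.g. to bound query time.
    df = bpd.read_gbq(
        "SELECT name FROM `my-project.my_dataset.my_table`",
        configuration={"query": {"useQueryCache": True, "timeoutMs": 30000}},
    )

    # A bare table ID plus a configuration raises ValueError, because the
    # job configuration only applies to query jobs.
    bpd.read_gbq("my-project.my_dataset.my_table", configuration={"query": {}})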
@@ -405,7 +415,7 @@ def _query_to_destination(
         query: str,
         index_cols: List[str],
         api_name: str,
-        use_cache: bool = True,
+        configuration: dict = {"query": {"useQueryCache": True}},
     ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]:
         # If a dry_run indicates this is not a query type job, then don't
         # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
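The new default above is the REST API's job configuration shape rather than a bare flag. For orientation, a sketch of the dict this code path understands; key names come from the BigQuery jobs resource, values are illustrative, and normally only one of the two timeout spellings would be present:

    configuration = {
        # Server-side job timeout in milliseconds (the REST API property).
        "jobTimeoutMs": 30000,
        "query": {
            "useQueryCache": True,
            # Legacy pandas-gbq spelling; _transform_read_gbq_configuration
            # (defined at the end of this diff) rewrites it to jobTimeoutMs.
            "timeoutMs": 30000,
        },
    }

The shared mutable default is safe only as long as _query_to_destination never writes to it, which the hunk below respects by building a QueryJobConfig copy via from_api_repr.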
@@ -427,23 +437,35 @@ def _query_to_destination(
         ][:_MAX_CLUSTER_COLUMNS]
         temp_table = self._create_empty_temp_table(schema, cluster_cols)
 
-        job_config = bigquery.QueryJobConfig()
+        timeout_ms = configuration.get("jobTimeoutMs") or configuration["query"].get(
+            "timeoutMs"
+        )
+
+        # Convert timeout_ms to seconds, ensuring a minimum of 0.1 seconds to avoid
+        # the program getting stuck on too-short timeouts.
+        timeout = max(int(timeout_ms) * 1e-3, 0.1) if timeout_ms else None
+
+        job_config = typing.cast(
+            bigquery.QueryJobConfig,
+            bigquery.QueryJobConfig.from_api_repr(configuration),
+        )
         job_config.labels["bigframes-api"] = api_name
         job_config.destination = temp_table
-        job_config.use_query_cache = use_cache
 
         try:
             # Write to temp table to workaround BigQuery 10 GB query results
             # limit. See: internal issue 303057336.
             job_config.labels["error_caught"] = "true"
-            _, query_job = self._start_query(query, job_config=job_config)
+            _, query_job = self._start_query(
+                query, job_config=job_config, timeout=timeout
+            )
             return query_job.destination, query_job
         except google.api_core.exceptions.BadRequest:
             # Some SELECT statements still aren't compatible with cluster
             # tables as the destination. For example, if the query has a
             # top-level ORDER BY, this conflicts with our ability to cluster
             # the table by the index column(s).
-            _, query_job = self._start_query(query)
+            _, query_job = self._start_query(query, timeout=timeout)
             return query_job.destination, query_job
 
     def read_gbq_query(
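The timeout handling above converts milliseconds to the seconds that google-cloud-bigquery expects and clamps tiny values. The same arithmetic in isolation (the helper name here is hypothetical, extracted only for illustration):

    def _timeout_seconds(timeout_ms):
        # Mirrors the expression above: ms -> s, clamped to >= 0.1 s so a
        # too-small timeout cannot make the wait loop give up instantly.
        return max(int(timeout_ms) * 1e-3, 0.1) if timeout_ms else None

    assert _timeout_seconds(30000) == 30.0
    assert _timeout_seconds("30000") == 30.0  # the REST API often carries ms as strings
    assert _timeout_seconds(50) == 0.1        # clamped to the 0.1 s minimum
    assert _timeout_seconds(None) is None     # no timeout configured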
@@ -452,8 +474,9 @@ def read_gbq_query(
         *,
         index_col: Iterable[str] | str = (),
         columns: Iterable[str] = (),
+        configuration: Optional[Dict] = None,
         max_results: Optional[int] = None,
-        use_cache: bool = True,
+        use_cache: Optional[bool] = None,
         col_order: Iterable[str] = (),
     ) -> dataframe.DataFrame:
         """Turn a SQL query into a DataFrame.
@@ -517,6 +540,7 @@ def read_gbq_query(
             query=query,
             index_col=index_col,
             columns=columns,
+            configuration=configuration,
             max_results=max_results,
             api_name="read_gbq_query",
             use_cache=use_cache,
@@ -528,10 +552,34 @@ def _read_gbq_query(
         *,
         index_col: Iterable[str] | str = (),
         columns: Iterable[str] = (),
+        configuration: Optional[Dict] = None,
         max_results: Optional[int] = None,
         api_name: str = "read_gbq_query",
-        use_cache: bool = True,
+        use_cache: Optional[bool] = None,
     ) -> dataframe.DataFrame:
+        configuration = _transform_read_gbq_configuration(configuration)
+
+        if "query" not in configuration:
+            configuration["query"] = {}
+
+        if "query" in configuration["query"]:
+            raise ValueError(
+                "The query statement must not be included in the "
+                "'configuration' because it is already provided as"
+                " a separate parameter."
+            )
+
+        if "useQueryCache" in configuration["query"]:
+            if use_cache is not None:
+                raise ValueError(
+                    "'useQueryCache' in 'configuration' conflicts with"
+                    " 'use_cache' parameter. Please specify only one."
+                )
+        else:
+            configuration["query"]["useQueryCache"] = (
+                True if use_cache is None else use_cache
+            )
+
         if isinstance(index_col, str):
             index_cols = [index_col]
         else:
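The cache plumbing above merges the legacy use_cache flag with the REST property and rejects conflicting inputs. Condensed into one hypothetical helper so the three outcomes are visible at a glance:

    def _resolve_use_query_cache(configuration, use_cache):
        # Mirrors the branch above: an explicit REST property wins, setting
        # both is an error, and the default with neither set is True.
        query_config = configuration.setdefault("query", {})
        if "useQueryCache" in query_config:
            if use_cache is not None:
                raise ValueError("Please specify only one.")
            return query_config["useQueryCache"]
        return True if use_cache is None else use_cache

    assert _resolve_use_query_cache({}, None) is True
    assert _resolve_use_query_cache({}, False) is False
    assert _resolve_use_query_cache({"query": {"useQueryCache": False}}, None) is False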
@@ -541,7 +589,7 @@ def _read_gbq_query(
             query,
             index_cols,
             api_name=api_name,
-            use_cache=use_cache,
+            configuration=configuration,
         )
 
         # If there was no destination table, that means the query must have
@@ -565,7 +613,7 @@ def _read_gbq_query(
             index_col=index_cols,
             columns=columns,
             max_results=max_results,
-            use_cache=use_cache,
+            use_cache=configuration["query"]["useQueryCache"],
         )
 
     def read_gbq_table(
@@ -1656,13 +1704,14 @@ def _start_query(
         sql: str,
         job_config: Optional[bigquery.job.QueryJobConfig] = None,
         max_results: Optional[int] = None,
+        timeout: Optional[float] = None,
     ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         """
         Starts BigQuery query job and waits for results.
         """
         job_config = self._prepare_query_job_config(job_config)
         return bigframes.session._io.bigquery.start_query_with_client(
-            self.bqclient, sql, job_config, max_results
+            self.bqclient, sql, job_config, max_results, timeout
         )
 
     def _start_query_ml_ddl(
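start_query_with_client lives in bigframes/session/_io/bigquery.py and is not shown in this diff; the call above implies it gained a trailing timeout parameter. A hedged sketch of how such a helper could forward the value, using only documented google-cloud-bigquery calls (QueryJob.result() accepts a timeout in seconds, which is why the milliseconds were converted earlier):

    from typing import Optional, Tuple

    from google.cloud import bigquery


    def start_query_with_client(
        bq_client: bigquery.Client,
        sql: str,
        job_config: bigquery.job.QueryJobConfig,
        max_results: Optional[int] = None,
        timeout: Optional[float] = None,
    ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
        # Sketch only: start the job, then wait for rows with the timeout.
        query_job = bq_client.query(sql, job_config=job_config)
        rows = query_job.result(max_results=max_results, timeout=timeout)
        return rows, query_job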
@@ -1876,3 +1925,25 @@ def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringVa
     # Escape backslashes and use backslash as delineator
     escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\")  # type: ignore
     return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped)
+
+
+def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict:
+    """
+    For backwards-compatibility, convert any previously client-side only
+    parameters such as timeoutMs to the property name expected by the REST API.
+
+    Makes a copy of configuration if changes are needed.
+    """
+
+    if configuration is None:
+        return {}
+
+    timeout_ms = configuration.get("query", {}).get("timeoutMs")
+    if timeout_ms is not None:
+        # Transform timeoutMs to an actual server-side configuration.
+        # https://github.com/googleapis/python-bigquery-pandas/issues/479
+        configuration = copy.deepcopy(configuration)
+        del configuration["query"]["timeoutMs"]
+        configuration["jobTimeoutMs"] = timeout_ms
+
+    return configuration
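A quick round-trip through the helper above, derivable directly from its body (input values are illustrative):

    assert _transform_read_gbq_configuration(None) == {}

    original = {"query": {"useQueryCache": False, "timeoutMs": 15000}}
    transformed = _transform_read_gbq_configuration(original)

    # query.timeoutMs is moved to the job-level jobTimeoutMs property.
    assert transformed == {"query": {"useQueryCache": False}, "jobTimeoutMs": 15000}
    # The input is deep-copied first, so the caller's dict is untouched.
    assert original["query"]["timeoutMs"] == 15000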