 import google.cloud.bigquery.job as bq_job
 import google.cloud.bigquery.table as bq_table
 import google.cloud.bigquery_storage_v1
+import pyarrow as pa
 
 import bigframes.core
-from bigframes.core import rewrite
-import bigframes.core.compile
+from bigframes.core import compile, local_data, pyarrow_utils, rewrite
 import bigframes.core.guid
 import bigframes.core.nodes as nodes
 import bigframes.core.ordering as order
@@ -70,9 +70,6 @@ def __init__(
     ):
         self.bqclient = bqclient
         self.storage_manager = storage_manager
-        self.compiler: bigframes.core.compile.SQLCompiler = (
-            bigframes.core.compile.SQLCompiler()
-        )
         self.strictly_ordered: bool = strictly_ordered
         self._cached_executions: weakref.WeakKeyDictionary[
             nodes.BigFrameNode, nodes.BigFrameNode
@@ -97,8 +94,11 @@ def to_sql(
     ) -> str:
         if offset_column:
             array_value, _ = array_value.promote_offsets()
-        node = self.logical_plan(array_value.node) if enable_cache else array_value.node
-        return self.compiler.compile(node, ordered=ordered)
+        node = (
+            self.simplify_plan(array_value.node) if enable_cache else array_value.node
+        )
+        compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered))
+        return compiled.sql
 
     def execute(
         self,
@@ -115,7 +115,6 @@ def execute(
         if bigframes.options.compute.enable_multi_query_execution:
             self._simplify_with_caching(array_value)
 
-        plan = self.logical_plan(array_value.node)
         # Use explicit destination to avoid 10GB limit of temporary table
         destination_table = (
             self.storage_manager.create_temp_table(
@@ -125,7 +124,7 @@ def execute(
             else None
         )
         return self._execute_plan(
-            plan,
+            array_value.node,
             ordered=ordered,
             page_size=page_size,
             max_results=max_results,
@@ -224,7 +223,7 @@ def peek(
         """
         A 'peek' efficiently accesses a small number of rows in the dataframe.
         """
-        plan = self.logical_plan(array_value.node)
+        plan = self.simplify_plan(array_value.node)
        if not tree_properties.can_fast_peek(plan):
            msg = bfe.format_message("Peeking this value cannot be done efficiently.")
            warnings.warn(msg)
@@ -240,7 +239,7 @@ def peek(
         )
 
         return self._execute_plan(
-            plan, ordered=False, destination=destination_table, peek=n_rows
+            array_value.node, ordered=False, destination=destination_table, peek=n_rows
         )
 
     def cached(
@@ -329,10 +328,10 @@ def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue):
         # Once rewriting is available, will want to rewrite before
         # evaluating execution cost.
         return tree_properties.is_trivially_executable(
-            self.logical_plan(array_value.node)
+            self.simplify_plan(array_value.node)
         )
 
-    def logical_plan(self, root: nodes.BigFrameNode) -> nodes.BigFrameNode:
+    def simplify_plan(self, root: nodes.BigFrameNode) -> nodes.BigFrameNode:
         """
         Apply universal logical simplifications that are helpful regardless of engine.
         """
@@ -345,29 +344,35 @@ def _cache_with_cluster_cols(
         self, array_value: bigframes.core.ArrayValue, cluster_cols: Sequence[str]
     ):
         """Executes the query and uses the resulting table to rewrite future executions."""
-
-        sql, schema, ordering_info = self.compiler.compile_raw(
-            self.logical_plan(array_value.node)
+        plan = self.simplify_plan(array_value.node)
+        compiled = compile.compile_sql(
+            compile.CompileRequest(
+                plan, sort_rows=False, materialize_all_order_keys=True
+            )
         )
         tmp_table = self._sql_as_cached_temp_table(
-            sql,
-            schema,
-            cluster_cols=bq_io.select_cluster_cols(schema, cluster_cols),
+            compiled.sql,
+            compiled.sql_schema,
+            cluster_cols=bq_io.select_cluster_cols(compiled.sql_schema, cluster_cols),
         )
         cached_replacement = array_value.as_cached(
             cache_table=self.bqclient.get_table(tmp_table),
-            ordering=ordering_info,
+            ordering=compiled.row_order,
         ).node
         self._cached_executions[array_value.node] = cached_replacement
 
     def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue):
         """Executes the query and uses the resulting table to rewrite future executions."""
         offset_column = bigframes.core.guid.generate_guid("bigframes_offsets")
         w_offsets, offset_column = array_value.promote_offsets()
-        sql = self.compiler.compile(self.logical_plan(w_offsets.node), ordered=False)
+        compiled = compile.compile_sql(
+            compile.CompileRequest(
+                array_value.node, sort_rows=False, materialize_all_order_keys=True
+            )
+        )
 
         tmp_table = self._sql_as_cached_temp_table(
-            sql,
+            compiled.sql,
             w_offsets.schema.to_bigquery(),
             cluster_cols=[offset_column],
         )
@@ -401,7 +406,7 @@ def _simplify_with_caching(self, array_value: bigframes.core.ArrayValue):
         # Apply existing caching first
         for _ in range(MAX_SUBTREE_FACTORINGS):
             if (
-                self.logical_plan(array_value.node).planning_complexity
+                self.simplify_plan(array_value.node).planning_complexity
                 < QUERY_COMPLEXITY_LIMIT
             ):
                 return
@@ -458,8 +463,8 @@ def _validate_result_schema(
         bq_schema: list[bigquery.SchemaField],
     ):
         actual_schema = _sanitize(tuple(bq_schema))
-        ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema(
-            self.logical_plan(array_value.node)
+        ibis_schema = compile.test_only_ibis_inferred_schema(
+            self.simplify_plan(array_value.node)
         ).to_bigquery()
         internal_schema = _sanitize(array_value.schema.to_bigquery())
         if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
@@ -477,7 +482,7 @@ def _validate_result_schema(
 
     def _execute_plan(
         self,
-        plan: nodes.BigFrameNode,
+        root: nodes.BigFrameNode,
         ordered: bool,
         page_size: Optional[int] = None,
         max_results: Optional[int] = None,
@@ -490,7 +495,9 @@ def _execute_plan(
         # TODO: Allow page_size and max_results by rechunking/truncating results
         if (not page_size) and (not max_results) and (not destination) and (not peek):
             for semi_executor in self._semi_executors:
-                maybe_result = semi_executor.execute(plan, ordered=ordered)
+                maybe_result = semi_executor.execute(
+                    self.simplify_plan(root), ordered=ordered
+                )
                 if maybe_result:
                     return maybe_result
 
@@ -500,31 +507,34 @@ def _execute_plan(
         # Use explicit destination to avoid 10GB limit of temporary table
         if destination is not None:
             job_config.destination = destination
-        sql = self.compiler.compile(plan, ordered=ordered, limit=peek)
+        compiled = compile.compile_sql(
+            compile.CompileRequest(
+                self.simplify_plan(root), sort_rows=ordered, peek_count=peek
+            )
+        )
         iterator, query_job = self._run_execute_query(
-            sql=sql,
+            sql=compiled.sql,
             job_config=job_config,
             page_size=page_size,
             max_results=max_results,
             query_with_job=(destination is not None),
         )
 
         # Though we provide the read client, iterator may or may not use it based on what is efficient for the result
-        def iterator_supplier():
-            # Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
-            if iterator._page_size is not None or iterator.max_results is not None:
-                return iterator.to_arrow_iterable(bqstorage_client=None)
-            else:
-                return iterator.to_arrow_iterable(
-                    bqstorage_client=self.bqstoragereadclient
-                )
+        # Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
+        if iterator._page_size is not None or iterator.max_results is not None:
+            batch_iterator = iterator.to_arrow_iterable(bqstorage_client=None)
+        else:
+            batch_iterator = iterator.to_arrow_iterable(
+                bqstorage_client=self.bqstoragereadclient
+            )
 
         if query_job:
-            size_bytes = self.bqclient.get_table(query_job.destination).num_bytes
+            table = self.bqclient.get_table(query_job.destination)
         else:
-            size_bytes = None
+            table = None
 
-        if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES:
+        if (table is not None) and (table.num_bytes or 0) >= MAX_SMALL_RESULT_BYTES:
             msg = bfe.format_message(
                 "The query result size has exceeded 10 GB. In BigFrames 2.0 and "
                 "later, you might need to manually set `allow_large_results=True` in "
@@ -536,14 +546,63 @@ def iterator_supplier():
         # Do not execute these validations outside of testing suite.
         if "PYTEST_CURRENT_TEST" in os.environ:
             self._validate_result_schema(
-                bigframes.core.ArrayValue(plan), iterator.schema
+                bigframes.core.ArrayValue(root), iterator.schema
+            )
+
+        # if destination is set, this is an externally managed table, which may mutated, cannot use as cache
+        if (
+            (destination is not None)
+            and (table is not None)
+            and (compiled.row_order is not None)
+            and (peek is None)
+        ):
+            # Assumption: GBQ cached table uses field name as bq column name
+            scan_list = nodes.ScanList(
+                tuple(
+                    nodes.ScanItem(field.id, field.dtype, field.id.name)
+                    for field in root.fields
+                )
+            )
+            cached_replacement = nodes.CachedTableNode(
+                source=nodes.BigqueryDataSource(
+                    nodes.GbqTable.from_table(
+                        table, columns=tuple(f.id.name for f in root.fields)
+                    ),
+                    ordering=compiled.row_order,
+                    n_rows=table.num_rows,
+                ),
+                scan_list=scan_list,
+                table_session=root.session,
+                original_node=root,
+            )
+            self._cached_executions[root] = cached_replacement
+        else:  # no explicit destination, can maybe peek iterator
+            # Assumption: GBQ cached table uses field name as bq column name
+            scan_list = nodes.ScanList(
+                tuple(
+                    nodes.ScanItem(field.id, field.dtype, field.id.name)
+                    for field in root.fields
+                )
+            )
+            # Will increase when have auto-upload, 5000 is most want to inline
+            batch_iterator, batches = pyarrow_utils.peek_batches(
+                batch_iterator, max_bytes=5000
             )
+            if batches:
+                local_cached = nodes.ReadLocalNode(
+                    local_data_source=local_data.ManagedArrowTable.from_pyarrow(
+                        pa.Table.from_batches(batches)
+                    ),
+                    scan_list=scan_list,
+                    session=root.session,
+                )
+                self._cached_executions[root] = local_cached
 
         return executor.ExecuteResult(
-            arrow_batches=iterator_supplier,
-            schema=plan.schema,
+            arrow_batches=batch_iterator,
+            schema=root.schema,
             query_job=query_job,
-            total_bytes=size_bytes,
+            total_bytes=table.num_bytes if table else None,
             total_rows=iterator.total_rows,
         )
 
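For readers unfamiliar with the batch-peeking step in the last hunk, here is a minimal standalone sketch of that pattern. It is illustrative only: `peek_batches_sketch` is not the actual `bigframes.core.pyarrow_utils.peek_batches` helper, whose exact semantics may differ. The idea assumed here is to drain Arrow record batches up to a byte budget and, only if the whole stream fits, hand the batches back so the result can be cached as a local table.

import itertools
from typing import Iterator, List, Tuple

import pyarrow as pa


def peek_batches_sketch(
    batches: Iterator[pa.RecordBatch], max_bytes: int
) -> Tuple[Iterator[pa.RecordBatch], List[pa.RecordBatch]]:
    """Consume batches up to max_bytes; return (replayable iterator, peeked batches).

    The peeked list is non-empty only when the entire stream fit under the budget,
    mirroring how the diff only builds a local cache node when `batches` is truthy.
    """
    peeked: List[pa.RecordBatch] = []
    total_bytes = 0
    for batch in batches:
        peeked.append(batch)
        total_bytes += batch.nbytes
        if total_bytes > max_bytes:
            # Too large to inline: hand back everything read so far plus the rest,
            # and report that no complete small result was captured.
            return itertools.chain(peeked, batches), []
    # The whole result fit under the budget; it can be rebuilt as a local table.
    return iter(peeked), peeked


if __name__ == "__main__":
    table = pa.table({"x": list(range(100))})
    stream, small = peek_batches_sketch(iter(table.to_batches(max_chunksize=25)), 5000)
    if small:
        print(pa.Table.from_batches(small).num_rows)  # inlineable small result
    print(sum(b.num_rows for b in stream))  # full stream is still consumable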