 import pandas
 import pyarrow as pa
 
-from bigframes.core import guid, local_data, utils
+from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils
 import bigframes.core as core
 import bigframes.core.blocks as blocks
 import bigframes.core.schema as schemata
@@ -183,35 +183,59 @@ def read_pandas(
         )
         managed_data = local_data.ManagedArrowTable.from_pandas(prepared_df)
 
+        block = blocks.Block(
+            self.read_managed_data(managed_data, method=method, api_name=api_name),
+            index_columns=idx_cols,
+            column_labels=pandas_dataframe.columns,
+            index_labels=pandas_dataframe.index.names,
+        )
+        return dataframe.DataFrame(block)
+
+    def read_managed_data(
+        self,
+        data: local_data.ManagedArrowTable,
+        method: Literal["load", "stream", "write"],
+        api_name: str,
+    ) -> core.ArrayValue:
+        offsets_col = guid.generate_guid("upload_offsets_")
         if method == "load":
-            array_value = self.load_data(managed_data, api_name=api_name)
+            gbq_source = self.load_data(
+                data, offsets_col=offsets_col, api_name=api_name
+            )
         elif method == "stream":
-            array_value = self.stream_data(managed_data)
+            gbq_source = self.stream_data(data, offsets_col=offsets_col)
         elif method == "write":
-            array_value = self.write_data(managed_data)
+            gbq_source = self.write_data(data, offsets_col=offsets_col)
         else:
             raise ValueError(f"Unsupported read method {method}")
 
-        block = blocks.Block(
-            array_value,
-            index_columns=idx_cols,
-            column_labels=pandas_dataframe.columns,
-            index_labels=pandas_dataframe.index.names,
+        return core.ArrayValue.from_bq_data_source(
+            source=gbq_source,
+            scan_list=nodes.ScanList(
+                tuple(
+                    nodes.ScanItem(
+                        identifiers.ColumnId(item.column), item.dtype, item.column
+                    )
+                    for item in data.schema.items
+                )
+            ),
+            session=self._session,
         )
-        return dataframe.DataFrame(block)
 
     def load_data(
-        self, data: local_data.ManagedArrowTable, api_name: Optional[str] = None
-    ) -> core.ArrayValue:
+        self,
+        data: local_data.ManagedArrowTable,
+        offsets_col: str,
+        api_name: Optional[str] = None,
+    ) -> nodes.BigqueryDataSource:
         """Load managed data into bigquery"""
-        ordering_col = guid.generate_guid("load_offsets_")
 
         # JSON support incomplete
         for item in data.schema.items:
             _validate_dtype_can_load(item.column, item.dtype)
 
         schema_w_offsets = data.schema.append(
-            schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE)
+            schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
         bq_schema = schema_w_offsets.to_bigquery(_LOAD_JOB_TYPE_OVERRIDES)
 
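A rough sketch of the call flow this hunk introduces, for orientation only. It assumes just the signatures shown above; `loader` stands in for the instance that owns these methods (`self` in the diff) and the sample DataFrame is illustrative:

import pandas as pd

from bigframes.core import local_data

# Wrap pandas data the same way read_pandas does above.
df = pd.DataFrame({"a": [1, 2, 3]})
managed = local_data.ManagedArrowTable.from_pandas(df)

# read_managed_data generates a single offsets column, dispatches to the chosen
# upload path ("load", "stream", or "write"), and wraps the returned
# BigqueryDataSource in an ArrayValue; read_pandas then builds its Block from it.
array_value = loader.read_managed_data(managed, method="stream", api_name="read_pandas")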
@@ -222,13 +246,13 @@ def load_data(
             job_config.labels = {"bigframes-api": api_name}
 
         load_table_destination = self._storage_manager.create_temp_table(
-            bq_schema, [ordering_col]
+            bq_schema, [offsets_col]
         )
 
         buffer = io.BytesIO()
         data.to_parquet(
             buffer,
-            offsets_col=ordering_col,
+            offsets_col=offsets_col,
             geo_format="wkt",
             duration_type="duration",
             json_type="string",
@@ -240,23 +264,24 @@ def load_data(
         self._start_generic_job(load_job)
         # must get table metadata after load job for accurate metadata
         destination_table = self._bqclient.get_table(load_table_destination)
-        return core.ArrayValue.from_table(
-            table=destination_table,
-            schema=schema_w_offsets,
-            session=self._session,
-            offsets_col=ordering_col,
-            n_rows=data.data.num_rows,
-        ).drop_columns([ordering_col])
+        return nodes.BigqueryDataSource(
+            destination_table,
+            ordering=ordering.TotalOrdering.from_offset_col(offsets_col),
+            n_rows=destination_table.num_rows,
+        )
 
-    def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
+    def stream_data(
+        self,
+        data: local_data.ManagedArrowTable,
+        offsets_col: str,
+    ) -> nodes.BigqueryDataSource:
         """Load managed data into bigquery"""
-        ordering_col = guid.generate_guid("stream_offsets_")
         schema_w_offsets = data.schema.append(
-            schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE)
+            schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
         bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES)
         load_table_destination = self._storage_manager.create_temp_table(
-            bq_schema, [ordering_col]
+            bq_schema, [offsets_col]
         )
 
         rows = data.itertuples(
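Condensing the producer/consumer contract of the load path into one hedged sketch: it uses only constructors that appear in this diff, with `destination_table`, `offsets_col`, `data`, and `session` standing in for the values computed in the surrounding methods:

# Producer (load_data): return the loaded temp table plus a total ordering
# derived from the appended offsets column; the row count now comes from the
# table metadata fetched after the load job, not from the local Arrow data.
gbq_source = nodes.BigqueryDataSource(
    destination_table,
    ordering=ordering.TotalOrdering.from_offset_col(offsets_col),
    n_rows=destination_table.num_rows,
)

# Consumer (read_managed_data): scan only the user-visible columns, so the
# offsets column never surfaces and no drop_columns() call is needed.
array_value = core.ArrayValue.from_bq_data_source(
    source=gbq_source,
    scan_list=nodes.ScanList(
        tuple(
            nodes.ScanItem(identifiers.ColumnId(item.column), item.dtype, item.column)
            for item in data.schema.items
        )
    ),
    session=session,
)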
@@ -279,20 +304,21 @@ def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
             table=destination_table,
             schema=schema_w_offsets,
             session=self._session,
-            offsets_col=ordering_col,
+            offsets_col=offsets_col,
             n_rows=data.data.num_rows,
-        ).drop_columns([ordering_col])
+        ).drop_columns([offsets_col])
 
-    def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
+    def write_data(
+        self,
+        data: local_data.ManagedArrowTable,
+        offsets_col: str,
+    ) -> nodes.BigqueryDataSource:
         """Load managed data into bigquery"""
-        ordering_col = guid.generate_guid("stream_offsets_")
         schema_w_offsets = data.schema.append(
-            schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE)
+            schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
         bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES)
-        bq_table_ref = self._storage_manager.create_temp_table(
-            bq_schema, [ordering_col]
-        )
+        bq_table_ref = self._storage_manager.create_temp_table(bq_schema, [offsets_col])
 
         requested_stream = bq_storage_types.stream.WriteStream()
         requested_stream.type_ = bq_storage_types.stream.WriteStream.Type.COMMITTED  # type: ignore
@@ -304,7 +330,7 @@ def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
 
         def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]:
             schema, batches = data.to_arrow(
-                offsets_col=ordering_col, duration_type="int"
+                offsets_col=offsets_col, duration_type="int"
             )
             offset = 0
             for batch in batches:
@@ -334,9 +360,9 @@ def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]:
             table=destination_table,
             schema=schema_w_offsets,
             session=self._session,
-            offsets_col=ordering_col,
+            offsets_col=offsets_col,
             n_rows=data.data.num_rows,
-        ).drop_columns([ordering_col])
+        ).drop_columns([offsets_col])
 
     def _start_generic_job(self, job: formatting_helpers.GenericJob):
         if bigframes.options.display.progress_bar is not None:
@@ -533,7 +559,7 @@ def read_gbq_table(
         if not primary_key:
             array_value = array_value.order_by(
                 [
-                    bigframes.core.ordering.OrderingExpression(
+                    ordering.OrderingExpression(
                         bigframes.operations.RowKey().as_expr(
                             *(id for id in array_value.column_ids)
                         ),