Commit 9e0b383

refactor: Simplify local arrow data management (#1591)
1 parent: 2ce891f

7 files changed: +65 additions, -72 deletions

bigframes/core/array_value.py

Lines changed: 2 additions & 9 deletions

@@ -16,15 +16,13 @@
 from dataclasses import dataclass
 import datetime
 import functools
-import io
 import typing
 from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
 import warnings
 
 import google.cloud.bigquery
 import pandas
 import pyarrow as pa
-import pyarrow.feather as pa_feather
 
 import bigframes.core.expression as ex
 import bigframes.core.guid
@@ -63,21 +61,16 @@ def from_pyarrow(cls, arrow_table: pa.Table, session: Session):
         adapted_table = local_data.adapt_pa_table(arrow_table)
         schema = local_data.arrow_schema_to_bigframes(adapted_table.schema)
 
-        iobytes = io.BytesIO()
-        pa_feather.write_feather(adapted_table, iobytes)
-        # Scan all columns by default, we define this list as it can be pruned while preserving source_def
         scan_list = nodes.ScanList(
             tuple(
                 nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column)
                 for item in schema.items
             )
         )
-
+        data_source = local_data.ManagedArrowTable(adapted_table, schema)
         node = nodes.ReadLocalNode(
-            iobytes.getvalue(),
-            data_schema=schema,
+            data_source,
             session=session,
-            n_rows=arrow_table.num_rows,
             scan_list=scan_list,
         )
        return cls(node)
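The net effect: ingesting local data no longer round-trips the pyarrow table through Feather bytes; the table is wrapped as-is. Below is a minimal sketch of the old serialization path this removes, assuming only pyarrow is installed (names here are illustrative, not the bigframes API):

import io

import pyarrow as pa
import pyarrow.feather as pa_feather

table = pa.table({"x": [1, 2, 3]})

# Old path: serialize to Feather bytes at ingest, deserialize again in every consumer.
iobytes = io.BytesIO()
pa_feather.write_feather(table, iobytes)
feather_bytes = iobytes.getvalue()
round_tripped = pa_feather.read_table(io.BytesIO(feather_bytes))
assert round_tripped.equals(table)

# New path: keep the pa.Table itself and hand it to ManagedArrowTable
# (defined in bigframes/core/local_data.py below), with no copy required.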

bigframes/core/compile/compiled.py

Lines changed: 8 additions & 32 deletions

@@ -26,15 +26,15 @@
 from bigframes_vendored.ibis.expr.operations import window as ibis_expr_window
 import bigframes_vendored.ibis.expr.operations as ibis_ops
 import bigframes_vendored.ibis.expr.types as ibis_types
-import pandas
+from google.cloud import bigquery
+import pyarrow as pa
 
 import bigframes.core.compile.aggregate_compiler as agg_compiler
 import bigframes.core.compile.googlesql
 import bigframes.core.compile.ibis_types
 import bigframes.core.compile.scalar_op_compiler as op_compilers
 import bigframes.core.compile.scalar_op_compiler as scalar_op_compiler
 import bigframes.core.expression as ex
-import bigframes.core.guid
 from bigframes.core.ordering import OrderingExpression
 import bigframes.core.sql
 from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec
@@ -279,11 +279,8 @@ def _reproject_to_table(self) -> UnorderedIR:
         )
 
     @classmethod
-    def from_pandas(
-        cls,
-        pd_df: pandas.DataFrame,
-        scan_cols: bigframes.core.nodes.ScanList,
-        offsets: typing.Optional[str] = None,
+    def from_polars(
+        cls, pa_table: pa.Table, schema: Sequence[bigquery.SchemaField]
     ) -> UnorderedIR:
         # TODO: add offsets
         """
@@ -292,37 +289,16 @@ def from_pandas(
         Assumed that the dataframe has unique string column names and bigframes-suppported
         dtypes.
         """
+        import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes
 
-        # ibis memtable cannot handle NA, must convert to None
-        # this destroys the schema however
-        ibis_values = pd_df.astype("object").where(pandas.notnull(pd_df), None)  # type: ignore
-        if offsets:
-            ibis_values = ibis_values.assign(**{offsets: range(len(pd_df))})
         # derive the ibis schema from the original pandas schema
-        ibis_schema = [
-            (
-                local_label,
-                bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(dtype),
-            )
-            for id, dtype, local_label in scan_cols.items
-        ]
-        if offsets:
-            ibis_schema.append((offsets, ibis_dtypes.int64))
-
         keys_memtable = bigframes_vendored.ibis.memtable(
-            ibis_values, schema=bigframes_vendored.ibis.schema(ibis_schema)
+            pa_table,
+            schema=third_party_ibis_bqtypes.BigQuerySchema.to_ibis(list(schema)),
         )
-
-        columns = [
-            keys_memtable[local_label].name(col_id.sql)
-            for col_id, _, local_label in scan_cols.items
-        ]
-        if offsets:
-            columns.append(keys_memtable[offsets].name(offsets))
-
         return cls(
             keys_memtable,
-            columns=columns,
+            columns=tuple(keys_memtable[key] for key in keys_memtable.columns),
         )
 
     def join(
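The old from_pandas path had to scrub NA values to None (destroying dtypes along the way) and rebuild an ibis schema by hand; an ibis memtable can ingest a pyarrow table with an explicit schema directly. A minimal sketch using the plain ibis package (an assumption for illustration; the real code uses bigframes' vendored ibis fork plus BigQuerySchema.to_ibis to translate google.cloud.bigquery SchemaFields):

import ibis
import pyarrow as pa

pa_table = pa.table({"a": [1, None, 3], "b": ["x", "y", "z"]})

# Nulls and dtypes survive intact; no astype("object") workaround needed.
memtable = ibis.memtable(pa_table)
columns = tuple(memtable[key] for key in memtable.columns)
print(columns)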

bigframes/core/compile/compiler.py

Lines changed: 14 additions & 15 deletions

@@ -14,24 +14,20 @@
 from __future__ import annotations
 
 import functools
-import io
 import typing
 
 import bigframes_vendored.ibis.backends.bigquery as ibis_bigquery
 import bigframes_vendored.ibis.expr.api as ibis_api
 import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
 import bigframes_vendored.ibis.expr.types as ibis_types
 import google.cloud.bigquery
-import pandas as pd
+import pyarrow as pa
 
 from bigframes import dtypes, operations
-from bigframes.core import utils
 import bigframes.core.compile.compiled as compiled
 import bigframes.core.compile.concat as concat_impl
 import bigframes.core.compile.explode
-import bigframes.core.compile.ibis_types
 import bigframes.core.compile.scalar_op_compiler as compile_scalar
-import bigframes.core.compile.schema_translator
 import bigframes.core.nodes as nodes
 import bigframes.core.ordering as bf_ordering
 import bigframes.core.rewrite as rewrites
@@ -161,19 +157,22 @@ def compile_fromrange(
 
 @_compile_node.register
 def compile_readlocal(node: nodes.ReadLocalNode, *args):
-    array_as_pd = pd.read_feather(
-        io.BytesIO(node.feather_bytes),
-        columns=[item.source_id for item in node.scan_list.items],
-    )
-
-    # Convert timedeltas to microseconds for compatibility with BigQuery
-    _ = utils.replace_timedeltas_with_micros(array_as_pd)
-
     offsets = node.offsets_col.sql if node.offsets_col else None
-    return compiled.UnorderedIR.from_pandas(
-        array_as_pd, node.scan_list, offsets=offsets
+    pa_table = node.local_data_source.data
+    bq_schema = node.schema.to_bigquery()
+
+    pa_table = pa_table.select(list(item.source_id for item in node.scan_list.items))
+    pa_table = pa_table.rename_columns(
+        {item.source_id: item.id.sql for item in node.scan_list.items}
     )
 
+    if offsets:
+        pa_table = pa_table.append_column(
+            offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
+        )
+        bq_schema = (*bq_schema, google.cloud.bigquery.SchemaField(offsets, "INT64"))
+    return compiled.UnorderedIR.from_polars(pa_table, bq_schema)
+
 
 @_compile_node.register
 def compile_readtable(node: nodes.ReadTableNode, *args):
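compile_readlocal now prunes, renames, and optionally appends an offsets column with pure pyarrow table operations instead of a pandas detour. A standalone sketch of those operations, assuming a pyarrow version whose rename_columns accepts a mapping (older releases take a positional list of names; the column names here are illustrative):

import pyarrow as pa

pa_table = pa.table({"col_a": [10, 20, 30], "col_b": ["x", "y", "z"]})

# Prune to the scanned columns, then map source ids to their SQL ids.
pa_table = pa_table.select(["col_a"])
pa_table = pa_table.rename_columns({"col_a": "bfcol_0"})

# Materialize sequential offsets the same way the compiler does.
offsets = "bfcol_offsets"
pa_table = pa_table.append_column(
    offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
)
assert pa_table.column_names == ["bfcol_0", "bfcol_offsets"]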

bigframes/core/compile/polars/compiler.py

Lines changed: 4 additions & 5 deletions

@@ -205,11 +205,10 @@ def compile_readlocal(self, node: nodes.ReadLocalNode):
         cols_to_read = {
             scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items
         }
-        return (
-            pl.read_ipc(node.feather_bytes, columns=list(cols_to_read.keys()))
-            .lazy()
-            .rename(cols_to_read)
-        )
+        lazy_frame = cast(
+            pl.DataFrame, pl.from_arrow(node.local_data_source.data)
+        ).lazy()
+        return lazy_frame.select(cols_to_read.keys()).rename(cols_to_read)
 
     @compile_node.register
     def compile_filter(self, node: nodes.FilterNode):
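The polars backend likewise swaps the IPC (Feather) read for pl.from_arrow, which is zero-copy for most Arrow types. A minimal sketch, assuming the polars and pyarrow packages (column names are illustrative):

from typing import cast

import polars as pl
import pyarrow as pa

arrow_table = pa.table({"src_a": [1, 2], "src_b": [3.0, 4.0]})
cols_to_read = {"src_a": "bfcol_0", "src_b": "bfcol_1"}

# from_arrow may return a DataFrame or a Series, hence the cast.
lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_table)).lazy()
result = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read)
print(result.collect())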

bigframes/core/local_data.py

Lines changed: 25 additions & 0 deletions

@@ -16,12 +16,37 @@
 
 from __future__ import annotations
 
+import dataclasses
+import functools
+import uuid
+
 import pyarrow as pa
 
 import bigframes.core.schema as schemata
 import bigframes.dtypes
 
 
+@dataclasses.dataclass(frozen=True)
+class LocalTableMetadata:
+    total_bytes: int
+    row_count: int
+
+    @classmethod
+    def from_arrow(cls, table: pa.Table):
+        return cls(total_bytes=table.nbytes, row_count=table.num_rows)
+
+
+@dataclasses.dataclass(frozen=True)
+class ManagedArrowTable:
+    data: pa.Table = dataclasses.field(hash=False)
+    schema: schemata.ArraySchema = dataclasses.field(hash=False)
+    id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4)
+
+    @functools.cached_property
+    def metadata(self):
+        return LocalTableMetadata.from_arrow(self.data)
+
+
 def arrow_schema_to_bigframes(arrow_schema: pa.Schema) -> schemata.ArraySchema:
     """Infer the corresponding bigframes schema given a pyarrow schema."""
     schema_items = tuple(
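ManagedArrowTable is the new single handle for local data: the pa.Table and schema are excluded from hashing (hash=False), identity comes from an auto-generated uuid, and size statistics are computed lazily. A usage sketch, assuming this commit's module and its adapt_pa_table helper:

import pyarrow as pa

import bigframes.core.local_data as local_data

arrow_table = pa.table({"x": [1, 2, 3]})
adapted = local_data.adapt_pa_table(arrow_table)
schema = local_data.arrow_schema_to_bigframes(adapted.schema)

managed = local_data.ManagedArrowTable(adapted, schema)
print(managed.metadata.row_count)    # 3, computed on first access, then cached
print(managed.metadata.total_bytes)  # adapted.nbytes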

bigframes/core/nodes.py

Lines changed: 3 additions & 8 deletions

@@ -24,12 +24,10 @@
 
 import google.cloud.bigquery as bq
 
-from bigframes.core import identifiers
+from bigframes.core import identifiers, local_data
 from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field
 import bigframes.core.expression as ex
-import bigframes.core.guid
 from bigframes.core.ordering import OrderingExpression
-import bigframes.core.schema as schemata
 import bigframes.core.slices as slices
 import bigframes.core.window_spec as window
 import bigframes.dtypes
@@ -579,11 +577,8 @@ class ScanList:
 
 @dataclasses.dataclass(frozen=True, eq=False)
 class ReadLocalNode(LeafNode):
-    # TODO: Combine feather_bytes, data_schema, n_rows into a LocalDataDef struct
     # TODO: Track nullability for local data
-    feather_bytes: bytes
-    data_schema: schemata.ArraySchema
-    n_rows: int
+    local_data_source: local_data.ManagedArrowTable
     # Mapping of local ids to bfet id.
     scan_list: ScanList
     # Offsets are generated only if this is non-null
@@ -623,7 +618,7 @@ def explicitly_ordered(self) -> bool:
 
     @property
     def row_count(self) -> typing.Optional[int]:
-        return self.n_rows
+        return self.local_data_source.metadata.row_count
 
     @property
     def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
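With the three loose fields collapsed into local_data_source, row_count is derived from the wrapper's cached metadata instead of being stored on the node. A small sketch of the wrapper's identity semantics, assuming this commit's local_data module (ManagedArrowTable hashes only its uuid id field, so hashing stays cheap no matter how large the table is):

import pyarrow as pa

import bigframes.core.local_data as local_data

adapted = local_data.adapt_pa_table(pa.table({"x": [1, 2]}))
schema = local_data.arrow_schema_to_bigframes(adapted.schema)
source = local_data.ManagedArrowTable(adapted, schema)

hash(source)  # cheap: only the uuid participates in the hash
assert source.metadata.row_count == 2  # derived, not stored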

third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py

Lines changed: 9 additions & 3 deletions

@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import datetime
 import decimal
 import math
 import re
@@ -478,6 +479,11 @@ def visit_NonNullLiteral(self, op, *, value, dtype):
             return sge.convert(str(value))
 
         elif dtype.is_int64():
+            # allows directly using values out of a duration arrow array
+            if isinstance(value, datetime.timedelta):
+                value = (
+                    (value.days * 3600 * 24) + value.seconds
+                ) * 1_000_000 + value.microseconds
             return sge.convert(np.int64(value))
         return None
 
@@ -1024,7 +1030,7 @@ def visit_InMemoryTable(self, op, *, name, schema, data):
         # Avoid creating temp tables for small data, which is how memtable is
         # used in BigQuery DataFrames. Inspired by:
         # https://github.com/ibis-project/ibis/blob/efa6fb72bf4c790450d00a926d7bd809dade5902/ibis/backends/druid/compiler.py#L95
-        tuples = data.to_frame().itertuples(index=False)
+        rows = data.to_pyarrow(schema=None).to_pylist()  # type: ignore
         quoted = self.quoted
         columns = [sg.column(col, quoted=quoted) for col in schema.names]
         array_expr = sge.DataType(
@@ -1042,10 +1048,10 @@ def visit_InMemoryTable(self, op, *, name, schema, data):
                 sge.Struct(
                     expressions=tuple(
                         self.visit_Literal(None, value=value, dtype=type_)
-                        for value, type_ in zip(row, schema.types)
+                        for value, type_ in zip(row.values(), schema.types)
                     )
                 )
-                for row in tuples
+                for row in rows
             ]
         expr = sge.Unnest(
             expressions=[
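The new timedelta branch converts duration Arrow values to integral microseconds before emitting an INT64 literal. A quick standalone check of that arithmetic, assuming only the standard library (timedelta normalizes everything into days, seconds, and microseconds, so the conversion is exact):

import datetime

value = datetime.timedelta(days=1, seconds=2, microseconds=3)
micros = ((value.days * 3600 * 24) + value.seconds) * 1_000_000 + value.microseconds
assert micros == 86_402_000_003
assert micros == round(value.total_seconds() * 1_000_000)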
