Skip to content

Commit 475eeea

Browse files
committed
feat: (Series|DataFrame).explode
1 parent 9d8cf67 commit 475eeea

File tree

12 files changed

+195
-5
lines changed

12 files changed

+195
-5
lines changed

bigframes/core/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,19 @@ def join(
401401
return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node))
402402
return ArrayValue(join_node)
403403

404+
def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
    """Unnest array-valued columns into one output row per array element.

    Columns whose dtype is not array-like are silently dropped from the
    request; if no array-like column remains, the value is returned
    unchanged (no ExplodeNode is added to the plan).
    """
    array_cols = tuple(
        col_id
        for col_id in column_ids
        if bigframes.dtypes.is_array_like(self.get_column_type(col_id))
    )
    if not array_cols:
        return ArrayValue(self.node)
    return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=array_cols))
416+
404417
def _uniform_sampling(self, fraction: float) -> ArrayValue:
405418
"""Sampling the table on given fraction.
406419

bigframes/core/blocks.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,30 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
11791179
index_labels=self.column_labels.names,
11801180
)
11811181

1182+
def explode(
    self,
    column_ids: typing.Sequence[str],
    ignore_index: Optional[bool],
) -> Block:
    """Explode array columns into one row per element.

    Args:
        column_ids: internal ids of the columns to unnest.
        ignore_index: when truthy, replace the index with a fresh
            0..n-1 sequential index; otherwise keep the existing index.

    Returns:
        A new Block over the exploded expression.
    """
    expr = self.expr.explode(column_ids)
    # TODO: check multi-index
    # TODO: check ignore_index works if column_ids is empty.
    if ignore_index:
        new_index_col_id = guid.generate_guid("explode_index_")
        expr = expr.promote_offsets(new_index_col_id)
        expr = expr.drop_columns(self.index_columns)
        index_columns = [new_index_col_id]
        index_labels = [None]
    else:
        index_columns = list(self.index_columns)
        # Fix: preserve the block's INDEX labels here. The previous code used
        # `self.column_labels.names`, i.e. the labels of the value columns,
        # which mislabels the index whenever index and column labels differ.
        index_labels = self._index_labels
    return Block(
        expr,
        column_labels=self.column_labels,
        index_columns=index_columns,
        index_labels=index_labels,
    )
1205+
11821206
def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]:
11831207
"""
11841208
Gets a standard set of stats to preemptively fetch for a column if

bigframes/core/compile/compiled.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import typing
2121
from typing import Collection, Iterable, Literal, Optional, Sequence
2222

23+
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
2324
import ibis
2425
import ibis.backends.bigquery as ibis_bigquery
2526
import ibis.common.deferred # type: ignore
@@ -502,6 +503,16 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
502503
columns=columns,
503504
)
504505

506+
def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
    """Unnest array columns — NOT IMPLEMENTED for the unordered IR yet.

    NOTE(review): `column_ids` is currently ignored; this rebuilds the IR
    with the same table and columns, i.e. it is a stub/no-op pending a
    real implementation (see the ordered IR's explode for the intended
    semantics).
    """
    # TODO: HERE
    table = self._to_ibis_expr()
    columns = [table[column_name] for column_name in self._column_names]
    return UnorderedIR(
        table,
        columns=columns,
    )
515+
505516
## Helpers
506517
def _set_or_replace_by_id(
507518
self, id: str, new_value: ibis_types.Value
@@ -719,6 +730,89 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
719730
ordering=self._ordering,
720731
)
721732

733+
def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
    """Unnest the given array columns, one output row per array element.

    Strategy: build an array of element offsets, zip it with every target
    column so all arrays are unnested in lockstep, UNNEST the zipped array,
    then append the per-element offset to the ordering so row order stays
    total and deterministic.
    """
    table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)

    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # Offsets 0..min(len)-1 across the target columns.
    # TODO: check when all columns are empty.
    # Fix: the previous assignment had a stray trailing comma, making
    # `offset_array` a 1-tuple rather than an expression; it only worked
    # because select() flattens sequences.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset = table.select(
        offset_array,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    # TODO: file ibis bug, when column name is `array`
    zip_array_id = bigframes.core.guid.generate_guid("zip_array_")
    zip_array = (
        table_w_offset[offset_array_id]
        .zip(*[table_w_offset[column_id] for column_id in column_ids])
        .name(zip_array_id)
    )
    table_w_zip_array = table_w_offset.select(
        zip_array,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    unnest_array_id = bigframes.core.guid.generate_guid("unnest_array_")
    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")

    unnest_array = table_w_zip_array[zip_array_id].unnest().name(unnest_array_id)
    # zip() produces struct fields f1, f2, ...: f1 is the offset (zipped
    # first above), so the i-th target column lands in field f{i+2}.
    unnested_columns = [
        unnest_array[f"f{index + 2}"].name(column_id)
        for index, column_id in enumerate(column_ids)
    ]
    other_columns = [
        column_id for column_id in self._column_names if column_id not in column_ids
    ]
    table_w_unnest = table_w_zip_array.select(
        unnest_array["f1"].name(unnest_offset_id),
        *unnested_columns,
        *other_columns,
        *self._hidden_ordering_column_names,
    )

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    hidden_ordering_columns = [
        *[
            table_w_unnest[column_name]
            for column_name in self._hidden_ordering_column_names
        ],
        table_w_unnest[unnest_offset_id],
    ]
    # Extend the existing ordering with the element offset so rows coming
    # from the same source row keep their within-array order.
    ordering = ExpressionOrdering(
        ordering_value_columns=tuple(
            [
                *self._ordering.ordering_value_columns,
                ascending_over(unnest_offset_id),
            ]
        ),
        total_ordering_columns=frozenset(
            [*self._ordering.total_ordering_columns, unnest_offset_id]
        ),
    )

    return OrderedIR(
        table_w_unnest,
        columns=columns,
        hidden_ordering_columns=hidden_ordering_columns,
        ordering=ordering,
    )
815+
722816
def promote_offsets(self, col_id: str) -> OrderedIR:
723817
"""
724818
Convenience function to promote copy of column offsets to a value column. Can be used to reset index.

bigframes/core/compile/compiler.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True):
191191
)
192192

193193

194+
@_compile_node.register
def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True):
    """Compile an ExplodeNode by delegating to the child IR's explode()."""
    compiled_child = compile_node(node.child, ordered)
    return compiled_child.explode(node.column_ids)
197+
198+
194199
@_compile_node.register
195200
def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True):
196201
return compile_node(node.child, ordered)._uniform_sampling(node.fraction)

bigframes/core/nodes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,11 @@ def row_preserving(self) -> bool:
484484

485485
def __hash__(self):
486486
return self._node_hash
487+
488+
489+
@dataclass(frozen=True)
class ExplodeNode(UnaryNode):
    """Plan node that unnests array-valued columns into one row per element."""

    # Ids of the array columns to unnest; a tuple so the node stays hashable.
    column_ids: typing.Tuple[str, ...]

    def __hash__(self):
        # Use the precomputed node hash, matching the other node types in
        # this module.
        return self._node_hash

bigframes/dataframe.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2518,6 +2518,21 @@ def sample(
25182518
)[0]
25192519
)
25202520

2521+
def explode(
    self,
    column: str | typing.Sequence[str],
    *,
    ignore_index: Optional[bool] = False,
) -> DataFrame:
    """Transform each element of the given array column(s) into a row.

    Args:
        column: a single column label or a sequence of column labels to
            explode.
        ignore_index: if True, the result gets a fresh 0..n-1 index.

    Raises:
        ValueError: if no column is given or the same column is listed twice.
    """
    columns = list(column) if utils.is_list_like(column) else [column]
    if not columns:
        raise ValueError("column must be nonempty")
    # Fix: validate the NORMALIZED list, not the raw argument. The previous
    # check `len(column) > len(set(column))` treated a single string name as
    # a sequence of characters, so a column literally named e.g. "aa" would
    # wrongly fail the uniqueness check.
    if len(columns) > len(set(columns)):
        raise ValueError("column must be unique")
    return DataFrame(
        self._block.explode(column_ids=columns, ignore_index=ignore_index)
    )
2535+
25212536
def _split(
25222537
self,
25232538
ns: Iterable[int] = (),

bigframes/dtypes.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,19 @@ def is_string_like(type: ExpressionType) -> bool:
129129

130130

131131
def is_array_like(type: ExpressionType) -> bool:
    """Return True iff `type` is an Arrow list dtype (an array column)."""
    if not isinstance(type, pd.ArrowDtype):
        return False
    return isinstance(type.pyarrow_dtype, pa.ListType)
136135

137136

138137
def is_numeric(type: ExpressionType) -> bool:
    """Return True iff `type` is in the permissive numeric dtype set."""
    return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
140139

141140

141+
def is_iterable(type: ExpressionType) -> bool:
    """Return True for element-iterable dtypes: strings, bytes, and arrays."""
    return is_array_like(type) or type in (STRING_DTYPE, BYTES_DTYPE)
143+
144+
142145
def is_comparable(type: ExpressionType) -> bool:
    """Return True when values of `type` support ordering comparisons."""
    return (type is not None) and (type not in UNORDERED_DTYPES)
144147

bigframes/operations/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ def create_binary_op(
212212
# "len" accepts any element-iterable dtype (strings, bytes, arrays) and
# always yields an integer length.
len_op = create_unary_op(
    name="len",
    type_signature=op_typing.FixedOutputType(
        dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable"
    ),
)
reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM)

bigframes/series.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,6 +1546,17 @@ def sample(
15461546
)[0]
15471547
)
15481548

1549+
def explode(
    self,
    *,
    ignore_index: Optional[bool] = False,
) -> Series:
    """Transform each array element of this Series into its own row.

    With ignore_index=True the result receives a fresh 0..n-1 index.
    """
    exploded_block = self._block.explode(
        column_ids=[self._value_column], ignore_index=ignore_index
    )
    return Series(exploded_block)
1559+
15491560
def __array_ufunc__(
15501561
self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
15511562
) -> Series:

third_party/bigframes_vendored/ibis/backends/bigquery/registry.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,17 @@ def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString):
2626
return f"TO_JSON_STRING({arg})"
2727

2828

29+
def _generate_array(translator, op: vendored_ibis_ops.GenerateArray):
30+
arg = translator.translate(op.arg)
31+
return f"GENERATE_ARRAY(0, {arg})"
32+
33+
2934
# Registry patches: route vendored (not-yet-upstreamed) ibis operations to
# their BigQuery SQL translation functions, then install them into the main
# ibis operation registry.
patched_ops = {
    vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles,  # type:ignore
    vendored_ibis_ops.FirstNonNullValue: _first_non_null_value,  # type:ignore
    vendored_ibis_ops.LastNonNullValue: _last_non_null_value,  # type:ignore
    vendored_ibis_ops.ToJsonString: _to_json_string,  # type:ignore
    vendored_ibis_ops.GenerateArray: _generate_array,  # type:ignore
}

OPERATION_REGISTRY.update(patched_ops)

third_party/bigframes_vendored/ibis/expr/operations/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22
from __future__ import annotations
33

44
from bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403
5+
from bigframes_vendored.ibis.expr.operations.generic import * # noqa: F401 F403
56
from bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403
67
from bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/generic.py
2+
from __future__ import annotations
3+
4+
import ibis.expr.datatypes as dt
5+
from ibis.expr.operations.core import Unary
6+
7+
8+
# TODO: add this function to ibis
class GenerateArray(Unary):
    """Ibis operation for BigQuery's GENERATE_ARRAY(0, arg).

    Unary: takes a single integer argument (the inclusive upper bound).
    """
    # Output is always ARRAY<INT64>, regardless of the input integer dtype.
    dtype = dt.Array(dt.int64)

0 commit comments

Comments
 (0)