
Commit 89e0de7

TrevorBergeron authored and Genesis929 committed
feat: add DataFrame.eval and DataFrame.query (#361)
* feat: add DataFrame.eval, DataFrame.query
* address pr comments
* add docstring, disable new tests for legacy pandas
* vendor the pandas eval implementation
* amend eval docstring
* fix doctest expectation
* amend doctest
* pr comments
* Fix doctest for eval
1 parent d5fa4f1 commit 89e0de7

File tree

16 files changed, +3137 -0 lines changed

bigframes/core/eval.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from typing import Optional

import bigframes_vendored.pandas.core.computation.eval as vendored_pandas_eval
import bigframes_vendored.pandas.core.computation.parsing as vendored_pandas_eval_parsing

import bigframes.dataframe as dataframe
import bigframes.dtypes
import bigframes.series as series


def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]):
    """
    Evaluate the given Python expression.

    Args:
        df (DataFrame):
            Columns of this dataframe will be used to resolve variables in the expression.
        expr (str):
            One or more Python expressions to evaluate.
        target (DataFrame or None):
            The evaluation result will be written to the target if provided.

    Returns:
        Result of evaluation.
    """
    index_resolver = {
        vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(
            df.index.get_level_values(level).to_series()
        )
        for level, name in enumerate(df.index.names)
    }
    column_resolver = {
        vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series)
        for name, series in df.items()
    }
    # 3 Levels: user -> logging wrapper -> dataframe -> eval helper (this)
    return vendored_pandas_eval.eval(
        expr=expr, level=3, target=target, resolvers=(index_resolver, column_resolver)  # type: ignore
    )


@dataclasses.dataclass
class FakeNumpyArray:
    dtype: bigframes.dtypes.Dtype


class EvalSeries(series.Series):
    """Slightly modified Series that works better with pandas.eval."""

    def __init__(self, underlying: series.Series):
        super().__init__(data=underlying._block)

    @property
    def values(self):
        """Returns a fake numpy array with only a dtype property so that eval can determine the schema without actually downloading the data."""
        return FakeNumpyArray(self.dtype)
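
To make the intent of EvalSeries concrete, here is a minimal sketch (hypothetical column name, assumes an initialized BigQuery DataFrames session) of how the overridden values property lets the vendored eval machinery see a dtype without materializing any rows:

    import bigframes.pandas as bpd
    from bigframes.core.eval import EvalSeries

    df = bpd.DataFrame({"x": [1, 2, 3]})
    wrapped = EvalSeries(df["x"])
    # Only a FakeNumpyArray carrying the dtype is returned; no data is downloaded.
    print(wrapped.values.dtype)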

bigframes/dataframe.py

Lines changed: 11 additions & 0 deletions
@@ -1493,6 +1493,17 @@ def sort_values(
         )
         return DataFrame(self._block.order_by(ordering))
 
+    def eval(self, expr: str) -> DataFrame:
+        import bigframes.core.eval as bf_eval
+
+        return bf_eval.eval(self, expr, target=self)
+
+    def query(self, expr: str) -> DataFrame:
+        import bigframes.core.eval as bf_eval
+
+        eval_result = bf_eval.eval(self, expr, target=None)
+        return self[eval_result]
+
     def value_counts(
         self,
         subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None,
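
With these two methods in place, a minimal usage sketch (hypothetical data, assumes an initialized BigQuery DataFrames session) looks like this: eval with an assignment expression returns a DataFrame that includes the new column, while query keeps only the rows where the boolean expression is True.

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    with_total = df.eval("total = a + b")  # DataFrame with an extra "total" column
    big_rows = df.query("b >= 20")         # DataFrame filtered to matching rows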

tests/system/small/test_dataframe.py

Lines changed: 38 additions & 0 deletions
@@ -3850,6 +3850,44 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result
 
 
+@skip_legacy_pandas
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("new_col = int64_col + int64_too",),
+        ("new_col = (rowindex > 3) | bool_col",),
+        ("int64_too = bool_col\nnew_col2 = rowindex",),
+    ],
+)
+def test_df_eval(scalars_dfs, expr):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df.eval(expr).to_pandas()
+    pd_result = scalars_pandas_df.eval(expr)
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@skip_legacy_pandas
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("int64_col > int64_too",),
+        ("bool_col",),
+        ("((int64_col - int64_too) % @local_var) == 0",),
+    ],
+)
+def test_df_query(scalars_dfs, expr):
+    # local_var is referenced in expressions
+    local_var = 3  # NOQA
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df.query(expr).to_pandas()
+    pd_result = scalars_pandas_df.query(expr)
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
 @pytest.mark.parametrize(
     ("subset", "normalize", "ascending", "dropna"),
     [
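
The third parametrized query expression exercises the "@local_var" syntax: names prefixed with "@" resolve to Python variables in the caller's frame, which is why bigframes/core/eval.py passes level=3 to the vendored eval. A hedged sketch outside of pytest (hypothetical data, assumes an initialized BigQuery DataFrames session):

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"int64_col": [1, 2, 3, 4], "int64_too": [1, 1, 1, 1]})
    threshold = 2
    # "@threshold" is looked up in this local scope, not in the DataFrame's columns.
    print(df.query("int64_col > @threshold").to_pandas())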

third_party/bigframes_vendored/pandas/core/common.py

Lines changed: 26 additions & 0 deletions
@@ -3,6 +3,8 @@
 
 from typing import Callable, TYPE_CHECKING
 
+from bigframes_vendored.pandas.core.dtypes.inference import iterable_not_string
+
 if TYPE_CHECKING:
     from bigframes_vendored.pandas.pandas._typing import T
 
@@ -40,3 +42,27 @@ def pipe(
         return func(*args, **kwargs)
     else:
         return func(obj, *args, **kwargs)
+
+
+def flatten(line):
+    """
+    Flatten an arbitrarily nested sequence.
+
+    Parameters
+    ----------
+    line : sequence
+        The non-string sequence to flatten.
+
+    Notes
+    -----
+    This doesn't consider string sequences.
+
+    Returns
+    -------
+    flattened : generator
+    """
+    for element in line:
+        if iterable_not_string(element):
+            yield from flatten(element)
+        else:
+            yield element
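
For reference, a quick hedged illustration of this vendored helper: nested sequences are flattened recursively, while strings are yielded untouched.

    from bigframes_vendored.pandas.core.common import flatten

    # Strings are not treated as sequences to recurse into.
    print(list(flatten([1, [2, (3, 4)], "ab"])))  # -> [1, 2, 3, 4, 'ab']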

third_party/bigframes_vendored/pandas/core/computation/align.py

Lines changed: 226 additions & 0 deletions
@@ -0,0 +1,226 @@
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/align.py
"""
Core eval alignment algorithms.
"""
from __future__ import annotations

from functools import partial, wraps
from typing import Callable, TYPE_CHECKING
import warnings

import bigframes_vendored.pandas.core.common as com
from bigframes_vendored.pandas.core.computation.common import result_type_many
from bigframes_vendored.pandas.util._exceptions import find_stack_level
import numpy as np
from pandas.errors import PerformanceWarning

if TYPE_CHECKING:
    from collections.abc import Sequence

    from bigframes_vendored.pandas.core.generic import NDFrame
    from bigframes_vendored.pandas.core.indexes.base import Index
    from pandas._typing import F


def _align_core_single_unary_op(
    term,
) -> tuple[partial | type[NDFrame], dict[str, Index] | None]:
    typ: partial | type[NDFrame]
    axes: dict[str, Index] | None = None

    if isinstance(term.value, np.ndarray):
        typ = partial(np.asanyarray, dtype=term.value.dtype)
    else:
        typ = type(term.value)
        if hasattr(term.value, "axes"):
            axes = _zip_axes_from_type(typ, term.value.axes)

    return typ, axes


def _zip_axes_from_type(
    typ: type[NDFrame], new_axes: Sequence[Index]
) -> dict[str, Index]:
    return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}


def _any_pandas_objects(terms) -> bool:
    """
    Check a sequence of terms for instances of PandasObject.
    """
    return any(is_pandas_object(term.value) for term in terms)


def _filter_special_cases(f) -> Callable[[F], F]:
    @wraps(f)
    def wrapper(terms):
        # single unary operand
        if len(terms) == 1:
            return _align_core_single_unary_op(terms[0])

        term_values = (term.value for term in terms)

        # we don't have any pandas objects
        if not _any_pandas_objects(terms):
            return result_type_many(*term_values), None

        return f(terms)

    return wrapper


@_filter_special_cases
def _align_core(terms):
    term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
    term_dims = [terms[i].value.ndim for i in term_index]

    from pandas import Series

    ndims = Series(dict(zip(term_index, term_dims)))

    # initial axes are the axes of the largest-axis'd term
    biggest = terms[ndims.idxmax()].value
    typ = biggest._constructor
    axes = biggest.axes
    naxes = len(axes)
    gt_than_one_axis = naxes > 1

    for value in (terms[i].value for i in term_index):
        value_is_series = is_series(value)
        is_series_and_gt_one_axis = value_is_series and gt_than_one_axis

        for axis, items in enumerate(value.axes):
            if is_series_and_gt_one_axis:
                ax, itm = naxes - 1, value.index
            else:
                ax, itm = axis, items

            if not axes[ax].is_(itm):
                axes[ax] = axes[ax].join(itm, how="outer")

    for i, ndim in ndims.items():
        for axis, items in zip(range(ndim), axes):
            ti = terms[i].value

            if hasattr(ti, "reindex"):
                transpose = is_series(ti) and naxes > 1
                reindexer = axes[naxes - 1] if transpose else items

                term_axis_size = len(ti.axes[axis])
                reindexer_size = len(reindexer)

                ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
                if ordm >= 1 and reindexer_size >= 10000:
                    w = (
                        f"Alignment difference on axis {axis} is larger "
                        f"than an order of magnitude on term {repr(terms[i].name)}, "
                        f"by more than {ordm:.4g}; performance may suffer."
                    )
                    warnings.warn(
                        w, category=PerformanceWarning, stacklevel=find_stack_level()
                    )

                obj = ti.reindex(reindexer, axis=axis, copy=False)
                terms[i].update(obj)

        terms[i].update(terms[i].value.values)

    return typ, _zip_axes_from_type(typ, axes)


def align_terms(terms):
    """
    Align a set of terms.
    """
    try:
        # flatten the parse tree (a nested list, really)
        terms = list(com.flatten(terms))
    except TypeError:
        # can't iterate so it must just be a constant or single variable
        if is_series_or_dataframe(terms.value):
            typ = type(terms.value)
            return typ, _zip_axes_from_type(typ, terms.value.axes)
        return np.result_type(terms.type), None

    # if all resolved variables are numeric scalars
    if all(term.is_scalar for term in terms):
        return result_type_many(*(term.value for term in terms)).type, None

    # perform the main alignment
    typ, axes = _align_core(terms)
    return typ, axes


def reconstruct_object(typ, obj, axes, dtype):
    """
    Reconstruct an object given its type, raw value, and possibly empty
    (None) axes.

    Parameters
    ----------
    typ : object
        A type
    obj : object
        The value to use in the type constructor
    axes : dict
        The axes to use to construct the resulting pandas object

    Returns
    -------
    ret : typ
        An object of type ``typ`` with the value `obj` and possible axes
        `axes`.
    """
    try:
        typ = typ.type
    except AttributeError:
        pass

    res_t = np.result_type(obj.dtype, dtype)

    if not isinstance(typ, partial) and is_pandas_type(typ):
        return typ(obj, dtype=res_t, **axes)

    # special case for pathological things like ~True/~False
    if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
        ret_value = res_t.type(obj)
    else:
        ret_value = typ(obj).astype(res_t)
        # The condition is to distinguish 0-dim array (returned in case of
        # scalar) and 1 element array
        # e.g. np.array(0) and np.array([0])
        if (
            len(obj.shape) == 1
            and len(obj) == 1
            and not isinstance(ret_value, np.ndarray)
        ):
            ret_value = np.array([ret_value]).astype(res_t)

    return ret_value


# Custom to recognize BigFrames types
def is_series(obj) -> bool:
    from bigframes_vendored.pandas.core.series import Series

    return isinstance(obj, Series)


def is_series_or_dataframe(obj) -> bool:
    from bigframes_vendored.pandas.core.frame import NDFrame

    return isinstance(obj, NDFrame)


def is_pandas_object(obj) -> bool:
    from bigframes_vendored.pandas.core.frame import NDFrame
    from bigframes_vendored.pandas.core.indexes.base import Index

    return isinstance(obj, NDFrame) or isinstance(obj, Index)


def is_pandas_type(type) -> bool:
    from bigframes_vendored.pandas.core.frame import NDFrame
    from bigframes_vendored.pandas.core.indexes.base import Index

    return issubclass(type, NDFrame) or issubclass(type, Index)
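
The type-check helpers at the bottom are the BigFrames-specific piece: they dispatch on the vendored BigFrames base classes rather than pandas' internal ABCs, so BigFrames Series and DataFrame objects are recognized by the alignment code. A small hedged sketch (module path inferred from the vendoring layout, assumes an initialized BigQuery DataFrames session):

    import bigframes.pandas as bpd
    from bigframes_vendored.pandas.core.computation.align import is_pandas_object, is_series

    s = bpd.Series([1, 2, 3])
    print(is_series(s), is_pandas_object(s))  # expected: True True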

0 commit comments
