diff --git a/LICENSES/PANDAS_PLY_LICENSE b/LICENSES/PANDAS_PLY_LICENSE
new file mode 100644
index 0000000000000..eac7ca890d560
--- /dev/null
+++ b/LICENSES/PANDAS_PLY_LICENSE
@@ -0,0 +1,13 @@
+Copyright 2015 Coursera Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
index d4d8b7e4e9747..10a90488020b6 100644
--- a/pandas/api/tests/test_api.py
+++ b/pandas/api/tests/test_api.py
@@ -81,7 +81,7 @@ class TestPDApi(Base, tm.TestCase):
              'pivot', 'pivot_table', 'plot_params', 'qcut',
              'scatter_matrix',
              'show_versions', 'timedelta_range', 'unique',
-             'value_counts', 'wide_to_long']
+             'value_counts', 'wide_to_long', 'X']
 
     # top-level option funcs
     funcs_option = ['reset_option', 'describe_option', 'get_option',
diff --git a/pandas/computation/api.py b/pandas/computation/api.py
index e5814e08c4bbe..a821a579ab6f0 100644
--- a/pandas/computation/api.py
+++ b/pandas/computation/api.py
@@ -2,3 +2,4 @@
 
 from pandas.computation.eval import eval
 from pandas.computation.expr import Expr
+from pandas.computation.delayed import X
diff --git a/pandas/computation/delayed.py b/pandas/computation/delayed.py
new file mode 100644
index 0000000000000..2640f04727d0e
--- /dev/null
+++ b/pandas/computation/delayed.py
@@ -0,0 +1,350 @@
+"""
+Delayed (rename?)
+
+delayed selection api through magic `X` variable
+"""
+
+from __future__ import print_function
+from pandas.compat import iteritems
+from pandas import Series, DataFrame
+
+# includes large portions of pandas_ply, see LICENSES
+
+_error_doc = """
+pandas `X` is a deferred object that cannot be passed into
+most functions. {case} which is invalid.
+To pass a deferred Series into a function, use the .pipe
+function, for example, X.a.pipe(np.log), instead of np.log(X.a) """
+
+# numpy / pandas access
+_disallow_attr = [
+    '__array_struct__', '__array_interface__',
+    '_typ', '_data', 'columns', 'values',
+]
+
+_allowed_magic_methods = [
+    '__add__', '__div__', '__sub__', '__truediv__', '__mul__',
+    '__radd__', '__rdiv__', '__rsub__', '__rtruediv__', '__rmul__',
+    '__mod__', '__rmod__',
+    '__eq__', '__ge__', '__gt__', '__lt__', '__le__', '__ne__',
+    '__and__', '__or__', '__invert__', '__neg__', '__pos__',
+    '__rand__', '__ror__',
+    '__abs__', '__pow__',
+]
+# can leave most not implemented, but __iter__ is
+# needed, otherwise __getitem__ satisfies sequence protocol
+# and may iterate forever
+_blacklisted_magic_methods = [
+    '__iter__'
+]
+
+_accessors = [
+    'cat', 'dt', 'str'
+]
+
+
+class Expression(object):
+    """
+    Expression is the (abstract) base class for symbolic expressions.
+    Symbolic expressions are encoded representations of Python expressions,
+    kept on ice until you are ready to evaluate them.
+
+    If an expression is complete, it will act as a 1-argument
+    function, taking a DataFrame whose context to evaluate
+    the expr in. If not complete, __call__ will create
+    a symbolic call node.
+    """
+
+    def _eval(self, context, **options):
+        """Evaluate a symbolic expression.
+
+        Args:
+            context: The context object for evaluation. Currently, this is a
+                dictionary mapping symbol names to values,
+            `**options`: Options for evaluation. Currently, the only option is
+                `log`, which results in some debug output during evaluation if
+                it is set to `True`.
+
+        Returns:
+            anything
+        """
+        raise NotImplementedError
+
+    def __repr__(self):
+        raise NotImplementedError
+
+    def __getattr__(self, name):
+        """Construct a symbolic representation of `getattr(self, name)`."""
+        if name in ('_name', '_complete'):
+            # only reached when the attribute is really missing (e.g. on a
+            # bare instance made by copy/pickle); raise here, otherwise the
+            # `self._name` / `self._complete` lookups below recurse forever
+            raise AttributeError(name)
+
+        if name in _disallow_attr:
+            msg = "The {0} attribute was called on the object".format(name)
+            raise TypeError(_error_doc.format(case=msg))
+
+        # generally completeness alternates, for instance
+        # in the following expression, marking
+        # incomplete [i], complete [c]
+        #  X  .  a  .  pipe (np.exp) . sum ()
+        # [i]   [c]    [i]    [c]      [i] [c]
+
+        # but, accessors may break this pattern, so
+        # as a special case do some introspection
+        # to check if what's being asked for is callable
+        if self._name in _accessors:
+            # cleaner way to do this?
+            acc = getattr(getattr(Series, self._name), name)
+            complete = True
+            if hasattr(acc, '__call__'):
+                complete = False
+        else:
+            complete = not self._complete
+        return GetAttr(self, name, complete)
+
+    def __getitem__(self, name):
+        """Construct a symbolic representation of `self[name]`."""
+        return GetItem(self, name, complete=True)
+
+    def __call__(self, *args, **kwargs):
+        """Evaluate against a DataFrame if complete, else build a `Call`.
+
+        A complete expression must be called with exactly one positional
+        argument, a DataFrame (else ValueError), and is evaluated with that
+        frame as symbol 0. An incomplete expression records the call.
+        """
+        if self._complete:
+            # selection lambda passed to pandas
+            if len(args) != 1:
+                msg = ("too many values passed into `X`, selection "
+                       "likely malformed")
+                raise ValueError(msg)
+            df, = args
+            if not isinstance(df, DataFrame):
+                msg = ("`X` selection can only be evaluated in the context "
+                       "of a DataFrame ")
+                raise ValueError(msg)
+            return self._eval({0: df})
+        # symbolic call
+        return Call(self, args=args, kwargs=kwargs)
+
+    # error trapping
+    def __array__(self, *args, **kwargs):
+        """Raise TypeError rather than convert to a numpy array."""
+        msg = "The object was attempted to be converted to a numpy array"
+        raise TypeError(_error_doc.format(case=msg))
+
+
+def _get_sym_magic_method(name):
+    """Build a method that records `name` being called on an Expression."""
+    def magic_method(self, *args, **kwargs):
+        return Call(GetAttr(self, name), args, kwargs)
+    return magic_method
+
+
+def _get_blacklisted_method(name):
+    """Build a method that raises TypeError when `name` is invoked."""
+    def blacklisted_method(self, *args, **kwargs):
+        msg = "The {0} method was called".format(name)
+        raise TypeError(_error_doc.format(case=msg))
+    return blacklisted_method
+
+# install the operator overloads / traps on Expression
+for name in _allowed_magic_methods:
+    setattr(Expression, name, _get_sym_magic_method(name))
+for name in _blacklisted_magic_methods:
+    setattr(Expression, name, _get_blacklisted_method(name))
+
+
+class Symbol(Expression):
+    """`Symbol(name)` is an atomic symbolic expression, labelled with an
+    arbitrary `name`."""
+
+    def __init__(self, name, complete=False):
+        self._name = name
+        self._complete = complete
+
+    def _eval(self, context, **options):
+        if options.get('log'):
+            print('Symbol._eval', repr(self))
+        result = context[self._name]
+        if options.get('log'):
+            print('Returning', repr(self), '=>', repr(result))
+        return result
+
+    def __repr__(self):
+        return 'Symbol(%s)' % repr(self._name)
+
+
+class GetAttr(Expression):
+    """`GetAttr(obj, name)` is a symbolic expression representing the result of
+    `getattr(obj, name)`. (`obj` can itself be symbolic.)"""
+
+    def __init__(self, obj, name, complete=True):
+        self._obj = obj
+        self._name = name
+        self._complete = complete
+
+    def _eval(self, context, **options):
+        if options.get('log'):
+            print('GetAttr._eval', repr(self))
+        evaled_obj = eval_if_symbolic(self._obj, context, **options)
+        result = getattr(evaled_obj, self._name)
+        if options.get('log'):
+            print('Returning', repr(self), '=>', repr(result))
+        return result
+
+    def __repr__(self):
+        return 'getattr(%s, %s)' % (repr(self._obj), repr(self._name))
+
+
+class GetItem(Expression):
+    """`GetItem(obj, name)` is a symbolic expression representing the result of
+    `obj[name]`. (`obj` can itself be symbolic.)"""
+
+    def __init__(self, obj, name, complete=True):
+        self._obj = obj
+        self._name = name
+        self._complete = complete
+
+    def _eval(self, context, **options):
+        if options.get('log'):
+            print('GetItem._eval', repr(self))
+        evaled_obj = eval_if_symbolic(self._obj, context, **options)
+        result = evaled_obj[self._name]
+        if options.get('log'):
+            print('Returning', repr(self), '=>', repr(result))
+        return result
+
+    def __repr__(self):
+        return 'getitem(%s, %s)' % (repr(self._obj), repr(self._name))
+
+
+class Call(Expression):
+    """`Call(func, args, kwargs)` is a symbolic expression representing the
+    result of `func(*args, **kwargs)`. (`func`, each member of the `args`
+    iterable, and each value in the `kwargs` dictionary can themselves be
+    symbolic)."""
+
+    def __init__(self, func, args=None, kwargs=None, complete=True):
+        self._func = func
+        if not args:
+            args = []
+        if not kwargs:
+            kwargs = {}
+        self._args = args
+        self._kwargs = kwargs
+        self._complete = complete
+        self._name = None
+
+    def _eval(self, context, **options):
+        if options.get('log'):
+            print('Call._eval', repr(self))
+        evaled_func = eval_if_symbolic(self._func, context, **options)
+        evaled_args = [eval_if_symbolic(v, context, **options)
+                       for v in self._args]
+        evaled_kwargs = dict((k, eval_if_symbolic(v, context, **options))
+                             for k, v in iteritems(self._kwargs))
+        result = evaled_func(*evaled_args, **evaled_kwargs)
+        if options.get('log'):
+            print('Returning', repr(self), '=>', repr(result))
+        return result
+
+    def __repr__(self):
+        return '{func}(*{args}, **{kwargs})'.format(
+            func=repr(self._func),
+            args=repr(self._args),
+            kwargs=repr(self._kwargs))
+
+
+def eval_if_symbolic(obj, context, **options):
+    """Evaluate an object if it is a symbolic expression, or otherwise just
+    returns it back.
+
+    Args:
+        obj: Either a symbolic expression, or anything else (in which case this
+            is a noop).
+        context: Passed as an argument to `obj._eval` if `obj` is symbolic.
+        `**options`: Passed as arguments to `obj._eval` if `obj` is symbolic.
+
+    Returns:
+        anything
+
+    Examples:
+        >>> eval_if_symbolic(Symbol('x'), {'x': 10})
+        10
+        >>> eval_if_symbolic(7, {'x': 10})
+        7
+    """
+    return obj._eval(context, **options) if hasattr(obj, '_eval') else obj
+
+
+def to_callable(obj):
+    """Turn an object into a callable.
+
+    Args:
+        obj: This can be
+
+            * **a symbolic expression**, in which case the output callable
+              evaluates the expression with symbols taking values from the
+              callable's arguments (listed arguments named according to their
+              numerical index, keyword arguments named according to their
+              string keys),
+            * **a callable**, in which case the output callable is just the
+              input object, or
+            * **anything else**, in which case the output callable is a
+              constant function which always returns the input object.
+
+    Returns:
+        callable
+
+    Examples:
+        >>> to_callable(Symbol(0) + Symbol('x'))(3, x=4)
+        7
+        >>> to_callable(lambda x: x + 1)(10)
+        11
+        >>> to_callable(12)(3, x=4)
+        12
+    """
+    if hasattr(obj, '_eval'):
+        return lambda *args, **kwargs: obj._eval(
+            dict(enumerate(args), **kwargs))
+    elif callable(obj):
+        return obj
+    else:
+        return lambda *args, **kwargs: obj
+
+
+# keep?
+def sym_call(func, *args, **kwargs):
+    """Construct a symbolic representation of `func(*args, **kwargs)`.
+
+    This is necessary because `func(symbolic)` will not (ordinarily) know to
+    construct a symbolic expression when it receives the symbolic
+    expression `symbolic` as a parameter (if `func` is not itself symbolic).
+    So instead, we write `sym_call(func, symbolic)`.
+
+    Tip: If the main argument of the function is a (symbolic) DataFrame, then
+    pandas' `pipe` method takes care of this problem without `sym_call`. For
+    instance, while `np.sqrt(X)` won't work, `X.pipe(np.sqrt)` will.
+
+    Args:
+        func: Function to call on evaluation (can be symbolic).
+        `*args`: Arguments to provide to `func` on evaluation (can be symbolic).
+        `**kwargs`: Keyword arguments to provide to `func` on evaluation (can be
+            symbolic).
+
+    Returns:
+        `Call`
+
+    Example:
+        >>> sym_call(math.sqrt, Symbol('x'))._eval({'x': 16})
+        4.0
+    """
+
+    return Call(func, args=args, kwargs=kwargs)
+
+X = Symbol(0)
+"""A Symbol for "the first argument" (for convenience)."""
diff --git a/pandas/computation/tests/test_delayed.py b/pandas/computation/tests/test_delayed.py
new file mode 100644
index 0000000000000..54c6d2bbddd10
--- /dev/null
+++ b/pandas/computation/tests/test_delayed.py
@@ -0,0 +1,28 @@
+import pandas as pd
+from pandas import X
+
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+class TestDelayedApi(tm.TestCase):
+    def test_basics(self):
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': [1.5, 2.5, 3.4],
+                           'c': ['abc', 'def', 'efg'],
+                           'd': pd.to_datetime(['2014-01-01',
+                                                '2014-01-02', '2014-01-03'])})
+        assert_frame_equal(df[df['a'] > 1], df[X.a > 1])
+        assert_frame_equal(df[df['a'] == 1], df[X.a == 1])
+        assert_frame_equal(df.assign(e=lambda x: x['b'] + 1),
+                           df.assign(e=X.b + 1))
+        assert_frame_equal(df.assign(e=lambda x: x['d'].dt.day),
+                           df.assign(e=X.d.dt.day))
+        assert_frame_equal(df.assign(e=lambda x: x['c'].str.upper()),
+                           df.assign(e=X.c.str.upper()))
+
+    def test_unary_ops(self):
+        # `~` and unary `-` must build deferred expressions as well
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': [1.5, 2.5, 3.4]})
+        assert_frame_equal(df[~(df['a'] > 1)], df[~(X.a > 1)])
+        assert_frame_equal(df.assign(e=lambda x: -x['b']),
+                           df.assign(e=-X.b))