diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 760e6a614fd92..96f9fd912b664 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -269,7 +269,6 @@ A ``where`` operation for getting. df[df > 0] - Setting ~~~~~~~ @@ -708,3 +707,20 @@ Reading from an excel file :suppress: os.remove('foo.xlsx') + +Gotchas +------- + +If you are trying an operation and you see an exception like: + +.. code-block:: python + + >>> if pd.Series([False, True, False]): + print("I was true") + Traceback + ... + ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). + +See :ref:`Comparisons<basics.compare>` for an explanation and what to do. + +See :ref:`Gotchas<gotchas.truth>` as well. diff --git a/doc/source/basics.rst b/doc/source/basics.rst index a0818831fb988..b75c65ca727f4 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -8,7 +8,7 @@ from pandas import * randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - from pandas.compat import lrange + from pandas.compat import lrange ============================== Essential Basic Functionality @@ -198,6 +198,9 @@ replace NaN with some other value using ``fillna`` if you wish). Flexible Comparisons ~~~~~~~~~~~~~~~~~~~~ + +.. _basics.compare: + Starting in v0.8, pandas introduced binary comparison methods eq, ne, lt, gt, le, and ge to Series and DataFrame whose behavior is analogous to the binary arithmetic operations described above: @@ -205,9 +208,52 @@ arithmetic operations described above: .. ipython:: python df.gt(df2) - df2.ne(df) +These operations produce a pandas object of the same type as the left-hand-side input +that is of dtype ``bool``. These ``boolean`` objects can be used in indexing operations, +see :ref:`here<indexing.boolean>` + +Furthermore, you can apply the reduction functions: ``any()`` and ``all()`` to provide a +way to summarize these results. + +.. ipython:: python + + (df>0).all() + (df>0).any() + +Finally you can test if a pandas object is empty, via the ``empty`` property. + +.. 
ipython:: python + + df.empty + DataFrame(columns=list('ABC')).empty + +.. warning:: + + You might be tempted to do the following: + + .. code-block:: python + + >>> if df: + ... + + Or + + .. code-block:: python + + >>> df and df2 + + Both of these will raise because the truth value of an object holding multiple values is ambiguous. + + .. code-block:: python + + ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). + + +See :ref:`gotchas<gotchas.truth>` for a more detailed discussion. + + Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 003169839f029..cf3f1be59ac45 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -15,6 +15,58 @@ Caveats and Gotchas ******************* +Using If/Truth Statements with Pandas +------------------------------------- + +.. _gotchas.truth: + +Pandas follows the numpy convention of raising an error when you try to convert something to a ``bool``. +This happens in an ``if`` or when using the boolean operations, ``and``, ``or``, or ``not``. It is not clear +what the result of + +.. code-block:: python + + >>> if Series([False, True, False]): + ... + +should be. Should it be ``True`` because it's not zero-length? ``False`` because there are ``False`` values? +It is unclear, so instead, pandas raises a ``ValueError``: + +.. code-block:: python + + >>> if pd.Series([False, True, False]): + print("I was true") + Traceback + ... + ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). + + +If you see that, you need to explicitly choose what you want to do with it (e.g., use ``any()``, ``all()`` or ``empty``). +Or, you might want to check whether the pandas object is ``None``: + +.. 
code-block:: python + + >>> if pd.Series([False, True, False]) is not None: + print("I was not None") + I was not None + +Bitwise boolean +~~~~~~~~~~~~~~~ + +Bitwise boolean operators like ``==`` and ``!=`` will return a boolean ``Series``, +which is almost always what you want anyway. + +.. code-block:: python + + >>> s = pd.Series(range(5)) + >>> s == 4 + 0 False + 1 False + 2 False + 3 False + 4 True + dtype: bool + ``NaN``, Integer ``NA`` values and ``NA`` type promotions --------------------------------------------------------- @@ -428,7 +480,7 @@ parse HTML tables in the top-level pandas io function ``read_html``. lxml will work correctly: .. code-block:: sh - + # remove the included version conda remove lxml diff --git a/doc/source/release.rst b/doc/source/release.rst index 4e6ac7240512c..ccdfe7320f53d 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -130,6 +130,9 @@ pandas 0.13 now returns a ``MultiIndex`` rather than an ``Index``. (:issue:`4039`) - Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`) + - Factored out excel_value_to_python_value from ExcelFile::_parse_excel (:issue:`4589`) + - ``__nonzero__`` for all NDFrame objects will now raise a ``ValueError``; this reverts back to (:issue:`1073`, :issue:`4633`) + behavior. **Internal Refactoring** diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 022799cd88014..6f34617495c29 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -118,6 +118,18 @@ API changes index.set_names(["bob", "cranberry"], inplace=True) - Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`) + - ``__nonzero__`` for all NDFrame objects will now raise a ``ValueError``; this reverts back to (:issue:`1073`, :issue:`4633`) + behavior. + + This prevents behaviors like the following (which will now all raise ``ValueError``): + + .. code-block:: python + + if df: + .... 
+ + df1 and df2 + s1 and s2 Enhancements ~~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d15ce05e84d40..0fecddbd4f617 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -531,7 +531,8 @@ def empty(self): return not all(len(self._get_axis(a)) > 0 for a in self._AXIS_ORDERS) def __nonzero__(self): - return not self.empty + raise ValueError("The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().") + __bool__ = __nonzero__ #---------------------------------------------------------------------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f0ba0c3b54f4a..723ff2fd5ab56 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2101,9 +2101,22 @@ def filter(self, func, dropna=True, *args, **kwargs): else: res = path(group) - if res: + def add_indexer(): indexers.append(self.obj.index.get_indexer(group.index)) + # interpret the result of the filter + if isinstance(res,(bool,np.bool_)): + if res: + add_indexer() + else: + if getattr(res,'ndim',None) == 1: + if res.ravel()[0]: + add_indexer() + else: + + # in theory you could do .all() on the boolean result ? 
+ raise TypeError("the filter must return a boolean result") + if len(indexers) == 0: filtered = self.obj.take([]) # because np.concatenate would fail else: diff --git a/pandas/core/series.py b/pandas/core/series.py index e3f97a28fe8cc..3a8c0ec5e1a0f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -798,13 +798,6 @@ def __contains__(self, key): __long__ = _coerce_method(int) __int__ = _coerce_method(int) - def __nonzero__(self): - # special case of a single element bool series degenerating to a scalar - if self.dtype == np.bool_ and len(self) == 1: - return bool(self.iloc[0]) - return not self.empty - __bool__ = __nonzero__ - # we are preserving name here def __getstate__(self): return dict(_data=self._data, name=self.name) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e2d9235510f83..b5aaf93831a15 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1519,11 +1519,11 @@ def test_table_values_dtypes_roundtrip(self): with ensure_clean(self.path) as store: df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') store.append('df_f8', df1) - assert df1.dtypes == store['df_f8'].dtypes + assert_series_equal(df1.dtypes,store['df_f8'].dtypes) df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') store.append('df_i8', df2) - assert df2.dtypes == store['df_i8'].dtypes + assert_series_equal(df2.dtypes,store['df_i8'].dtypes) # incompatible dtype self.assertRaises(ValueError, store.append, 'df_i8', df1) @@ -1531,7 +1531,7 @@ def test_table_values_dtypes_roundtrip(self): # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought) df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A']) store.append('df_f4', df1) - assert df1.dtypes == store['df_f4'].dtypes + assert_series_equal(df1.dtypes,store['df_f4'].dtypes) assert df1.dtypes[0] == 'float32' # check with mixed dtypes diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 
e31aa5b3f5983..41b705d22b85d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10607,13 +10607,10 @@ def test_index_namedtuple(self): df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1) - def test_bool_empty_nonzero(self): + def test_empty_nonzero(self): df = DataFrame([1, 2, 3]) - self.assertTrue(bool(df)) self.assertFalse(df.empty) df = DataFrame(index=['a', 'b'], columns=['c', 'd']).dropna() - self.assertFalse(bool(df)) - self.assertFalse(bool(df.T)) self.assertTrue(df.empty) self.assertTrue(df.T.empty) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index f388c5d72627c..6ea58ec997e23 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -73,7 +73,6 @@ def _construct(self, shape, value=None, **kwargs): arr = np.random.randn(*shape) return self._typ(arr,**kwargs) - def _compare(self, result, expected): self._comparator(result,expected) @@ -82,14 +81,14 @@ def test_rename(self): # single axis for axis in self._axes(): kwargs = { axis : list('ABCD') } - o = self._construct(4,**kwargs) + obj = self._construct(4,**kwargs) # no values passed #self.assertRaises(Exception, o.rename(str.lower)) # rename a single axis - result = o.rename(**{ axis : str.lower }) - expected = o.copy() + result = obj.rename(**{ axis : str.lower }) + expected = obj.copy() setattr(expected,axis,list('abcd')) self._compare(result, expected) @@ -119,6 +118,41 @@ def test_get_numeric_data(self): self._compare(result, o) # _get_numeric_data is includes _get_bool_data, so can't test for non-inclusion + def test_nonzero(self): + + # GH 4633 + # look at the boolean/nonzero behavior for objects + obj = self._construct(shape=4) + self.assertRaises(ValueError, lambda : bool(obj == 0)) + self.assertRaises(ValueError, lambda : bool(obj == 1)) + self.assertRaises(ValueError, lambda : bool(obj)) + + obj = self._construct(shape=4,value=1) + 
self.assertRaises(ValueError, lambda : bool(obj == 0)) + self.assertRaises(ValueError, lambda : bool(obj == 1)) + self.assertRaises(ValueError, lambda : bool(obj)) + + obj = self._construct(shape=4,value=np.nan) + self.assertRaises(ValueError, lambda : bool(obj == 0)) + self.assertRaises(ValueError, lambda : bool(obj == 1)) + self.assertRaises(ValueError, lambda : bool(obj)) + + # empty + obj = self._construct(shape=0) + self.assertRaises(ValueError, lambda : bool(obj)) + + # invalid behaviors + + obj1 = self._construct(shape=4,value=1) + obj2 = self._construct(shape=4,value=1) + + def f(): + if obj1: + print("this works and shouldn't") + self.assertRaises(ValueError, f) + self.assertRaises(ValueError, lambda : obj1 and obj2) + self.assertRaises(ValueError, lambda : obj1 or obj2) + self.assertRaises(ValueError, lambda : not obj1) class TestSeries(unittest.TestCase, Generic): _typ = Series @@ -154,6 +188,14 @@ def test_get_numeric_data_preserve_dtype(self): expected = Series([],dtype='M8[ns]') self._compare(result, expected) + def test_nonzero_single_element(self): + + s = Series([True]) + self.assertRaises(ValueError, lambda : bool(s)) + + s = Series([False]) + self.assertRaises(ValueError, lambda : bool(s)) + class TestDataFrame(unittest.TestCase, Generic): _typ = DataFrame _comparator = lambda self, x, y: assert_frame_equal(x,y) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1f7570b7f7887..b2849aeb2fbe8 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -296,12 +296,6 @@ def test_scalar_conversion(self): self.assert_(int(Series([1.])) == 1) self.assert_(long(Series([1.])) == 1) - self.assert_(bool(Series([True])) == True) - self.assert_(bool(Series([False])) == False) - - self.assert_(bool(Series([True,True])) == True) - self.assert_(bool(Series([False,True])) == True) - def test_astype(self): s = Series(np.random.randn(5),name='foo') diff --git a/pandas/tseries/tests/test_timeseries.py 
b/pandas/tseries/tests/test_timeseries.py index 172172f667eca..01f573279fe5c 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -256,7 +256,7 @@ def test_indexing(self): df = DataFrame(randn(5,5),columns=['open','high','low','close','volume'],index=date_range('2012-01-02 18:01:00',periods=5,tz='US/Central',freq='s')) expected = df.loc[[df.index[2]]] result = df['2012-01-02 18:01:02'] - self.assert_(result == expected) + assert_frame_equal(result,expected) # this is a single date, so will raise self.assertRaises(KeyError, df.__getitem__, df.index[2],)