From 7ed887ef2104e05b3065749006659b3c579aafd8 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 15 May 2018 22:00:31 -0500
Subject: [PATCH 001/116] DOC: Added 0.23.1 whatsnew template (#21001)

---
 doc/source/whatsnew/v0.23.1.txt | 82 +++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 doc/source/whatsnew/v0.23.1.txt

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
new file mode 100644
index 0000000000000..5c9c3e2931bd9
--- /dev/null
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -0,0 +1,82 @@
+.. _whatsnew_0231:
+
+v0.23.1
+-------
+
+This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
+and bug fixes. We recommend that all users upgrade to this version.
+
+.. contents:: What's new in v0.23.1
+    :local:
+    :backlinks: none
+
+.. _whatsnew_0231.enhancements:
+
+New features
+~~~~~~~~~~~~
+
+
+.. _whatsnew_0231.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+
+-
+-
+
+.. _whatsnew_0231.performance:
+
+Performance Improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+-
+-
+
+Documentation Changes
+~~~~~~~~~~~~~~~~~~~~~
+
+-
+-
+
+.. _whatsnew_0231.bug_fixes:
+
+Bug Fixes
+~~~~~~~~~
+
+-
+-
+
+Conversion
+^^^^^^^^^^
+
+-
+-
+
+Indexing
+^^^^^^^^
+
+-
+-
+
+I/O
+^^^
+
+-
+-
+
+Plotting
+^^^^^^^^
+
+-
+-
+
+Reshaping
+^^^^^^^^^
+
+-
+-
+
+Categorical
+^^^^^^^^^^^
+
+-

From 4b30521677a2636daa880df8ad5dfcd1f77e0296 Mon Sep 17 00:00:00 2001
From: Wenhuan
Date: Wed, 16 May 2018 11:02:00 +0800
Subject: [PATCH 002/116] DOC: Correct the date of whatsnew v0.23 #21067
 (#21069)

(cherry picked from commit d63d0152f148bafc82b0af5175a5f1d10700991a)
---
 doc/source/whatsnew/v0.23.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 3f89de1dc22d8..feba9d856789b 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1,6 +1,6 @@
 .. _whatsnew_0230:
 
-v0.23.0 (May 15, 2017)
+v0.23.0 (May 15, 2018)
 ----------------------
 
 This is a major release from 0.22.0 and includes a number of API changes,

From 248fae2b76468f77d070962da24ea60c08aece10 Mon Sep 17 00:00:00 2001
From: Chalmer Lowe
Date: Wed, 16 May 2018 20:20:42 -0400
Subject: [PATCH 003/116] DOC: updated docstring for nanoseconds function per
 doc guidelines (#21065)

(cherry picked from commit 9f40757c9c8e8cc5df4984599f7047daff6685ae)
---
 pandas/_libs/tslibs/timedeltas.pyx | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index d17d4e7139d72..f7bb6c1dbb304 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -791,9 +791,32 @@ cdef class _Timedelta(timedelta):
     @property
     def nanoseconds(self):
         """
-        Number of nanoseconds (>= 0 and less than 1 microsecond).
+        Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
+
+        Returns
+        -------
+        int
+            Number of nanoseconds.
+
+        See Also
+        --------
+        Timedelta.components : Return all attributes with assigned values
+            (i.e. days, hours, minutes, seconds, milliseconds, microseconds,
+            nanoseconds).
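As a quick orientation for readers of the series, a minimal sketch of the
behavior the docstring above documents (pandas 0.23.x; illustrative, not part
of the commit):

    import pandas as pd

    td = pd.Timedelta('1 days 2 min 3 us 42 ns')
    assert td.nanoseconds == 42             # component in [0, 1000)
    assert td.components.nanoseconds == 42  # same field on the namedtuple
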
+
+        Examples
+        --------
+        **Using string input**
+
+        >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
+        >>> td.nanoseconds
+        42
+
+        **Using integer input**
-        .components will return the shown components
+
+        >>> td = pd.Timedelta(42, unit='ns')
+        >>> td.nanoseconds
+        42
         """
         self._ensure_components()
         return self._ns

From e46940025319114fd4a8f55635e3988072381ed0 Mon Sep 17 00:00:00 2001
From: topper-123
Date: Thu, 17 May 2018 01:21:51 +0100
Subject: [PATCH 004/116] improved performance of
 CategoricalIndex.is_monotonic* (#21025)

(cherry picked from commit 1ee5ecf3ddbf1ce92e1a78e90a4ef07fc3cf0840)
---
 asv_bench/benchmarks/categoricals.py  | 20 +++++++++++++++++++
 doc/source/whatsnew/v0.23.1.txt       |  1 +
 pandas/core/indexes/category.py       |  4 ++--
 pandas/tests/indexes/test_category.py | 28 ++++++++++++++++-----------
 4 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index ae1d7029217a4..5464e7cba22c3 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -173,3 +173,23 @@ def setup(self, dtype):
 
     def time_isin_categorical(self, dtype):
         self.series.isin(self.sample)
+
+
+class IsMonotonic(object):
+
+    def setup(self):
+        N = 1000
+        self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N))
+        self.s = pd.Series(self.c)
+
+    def time_categorical_index_is_monotonic_increasing(self):
+        self.c.is_monotonic_increasing
+
+    def time_categorical_index_is_monotonic_decreasing(self):
+        self.c.is_monotonic_decreasing
+
+    def time_categorical_series_is_monotonic_increasing(self):
+        self.s.is_monotonic_increasing
+
+    def time_categorical_series_is_monotonic_decreasing(self):
+        self.s.is_monotonic_decreasing
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 5c9c3e2931bd9..8c5111e712a34 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -29,6 +29,7 @@ Deprecations
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
 -
 -
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 3ffef5804acf7..78b7ae7054248 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -382,11 +382,11 @@ def is_unique(self):
 
     @property
     def is_monotonic_increasing(self):
-        return Index(self.codes).is_monotonic_increasing
+        return self._engine.is_monotonic_increasing
 
     @property
     def is_monotonic_decreasing(self):
-        return Index(self.codes).is_monotonic_decreasing
+        return self._engine.is_monotonic_decreasing
 
     @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
     def unique(self, level=None):
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index 6a1a1a5bdba4f..0e630f69b1a32 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -543,35 +543,41 @@ def test_reindex_empty_index(self):
         tm.assert_numpy_array_equal(indexer,
                                     np.array([-1, -1], dtype=np.intp))
 
-    def test_is_monotonic(self):
-        c = CategoricalIndex([1, 2, 3])
+    @pytest.mark.parametrize('data, non_lexsorted_data', [
+        [[1, 2, 3], [9, 0, 1, 2, 3]],
+        [list('abc'), list('fabcd')],
+    ])
+    def test_is_monotonic(self, data, non_lexsorted_data):
+        c = CategoricalIndex(data)
         assert c.is_monotonic_increasing
         assert not c.is_monotonic_decreasing
 
-        c = CategoricalIndex([1, 2, 3], ordered=True)
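The monotonicity semantics the new benchmarks and tests exercise, in a short
hedged example (assuming pandas 0.23.x defaults; not part of the commit):

    import pandas as pd

    c = pd.CategoricalIndex(list('aabbcc'))
    assert c.is_monotonic_increasing   # codes rise with the sorted categories

    # ordering follows the categories, not the raw values
    c = pd.CategoricalIndex(list('abc'), categories=list('cba'))
    assert c.is_monotonic_decreasing
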
+        c = CategoricalIndex(data, ordered=True)
         assert c.is_monotonic_increasing
         assert not c.is_monotonic_decreasing
 
-        c = CategoricalIndex([1, 2, 3], categories=[3, 2, 1])
+        c = CategoricalIndex(data, categories=reversed(data))
         assert not c.is_monotonic_increasing
         assert c.is_monotonic_decreasing
 
-        c = CategoricalIndex([1, 3, 2], categories=[3, 2, 1])
+        c = CategoricalIndex(data, categories=reversed(data), ordered=True)
         assert not c.is_monotonic_increasing
-        assert not c.is_monotonic_decreasing
+        assert c.is_monotonic_decreasing
 
-        c = CategoricalIndex([1, 2, 3], categories=[3, 2, 1], ordered=True)
+        # test when data is neither monotonic increasing nor decreasing
+        reordered_data = [data[0], data[2], data[1]]
+        c = CategoricalIndex(reordered_data, categories=reversed(data))
         assert not c.is_monotonic_increasing
-        assert c.is_monotonic_decreasing
+        assert not c.is_monotonic_decreasing
 
         # non lexsorted categories
-        categories = [9, 0, 1, 2, 3]
+        categories = non_lexsorted_data
 
-        c = CategoricalIndex([9, 0], categories=categories)
+        c = CategoricalIndex(categories[:2], categories=categories)
         assert c.is_monotonic_increasing
         assert not c.is_monotonic_decreasing
 
-        c = CategoricalIndex([0, 1], categories=categories)
+        c = CategoricalIndex(categories[1:3], categories=categories)
         assert c.is_monotonic_increasing
         assert not c.is_monotonic_decreasing

From a0893c7f94e4b1d2a8047816614da96e31d95803 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Thu, 17 May 2018 08:42:14 -0400
Subject: [PATCH 005/116] BUG: Prevent Unlimited Agg Recursion with Duplicate
 Col Names (#21066)

(cherry picked from commit d623ffd90f57abda2beb34d807da58ca95b3743d)
---
 doc/source/whatsnew/v0.23.1.txt  |  5 ++++-
 pandas/core/base.py              |  6 +++---
 pandas/core/frame.py             | 11 +++++++++--
 pandas/tests/frame/test_apply.py |  8 ++++++++
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 8c5111e712a34..35d150dc263b8 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -44,7 +44,10 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
--
+Groupby/Resample/Rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
 -
 
 Conversion
diff --git a/pandas/core/base.py b/pandas/core/base.py
index fa78c89ed4ee7..aa051c6f5eaef 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -590,9 +590,10 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):
 
         # multiples
         else:
-            for col in obj:
+            for index, col in enumerate(obj):
                 try:
-                    colg = self._gotitem(col, ndim=1, subset=obj[col])
+                    colg = self._gotitem(col, ndim=1,
+                                         subset=obj.iloc[:, index])
                     results.append(colg.aggregate(arg))
                     keys.append(col)
                 except (TypeError, DataError):
@@ -675,7 +676,6 @@ def _gotitem(self, key, ndim, subset=None):
         subset : object, default None
             subset to act on
         """
-
         # create a new object to prevent aliasing
         if subset is None:
             subset = self.obj
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index dccc840f5affd..77a67c048a48d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5731,7 +5731,12 @@ def diff(self, periods=1, axis=0):
     # ----------------------------------------------------------------------
     # Function application
 
-    def _gotitem(self, key, ndim, subset=None):
+    def _gotitem(self,
+                 key,         # type: Union[str, List[str]]
+                 ndim,        # type: int
+                 subset=None  # type: Union[Series, DataFrame, None]
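The fixed behavior can be reproduced with a tiny frame; before this patch the
call below recursed until the interpreter hit its stack limit (sketch, not
part of the commit):

    import pandas as pd

    df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'a'])
    result = df.agg(['min'])            # previously overflowed the stack
    assert list(result.columns) == ['a', 'a']
    assert list(result.index) == ['min']
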
+                 ):
+        # type: (...) -> Union[Series, DataFrame]
         """
         sub-classes to define
         return a sliced object
@@ -5746,9 +5751,11 @@ def _gotitem(self, key, ndim, subset=None):
         """
         if subset is None:
             subset = self
+        elif subset.ndim == 1:  # is Series
+            return subset
 
         # TODO: _shallow_copy(subset)?
-        return self[key]
+        return subset[key]
 
     _agg_doc = dedent("""
     The aggregation operations are always performed over an axis, either the
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index ac46f02d00773..dfb2961befe35 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -554,6 +554,14 @@ def test_apply_non_numpy_dtype(self):
         result = df.apply(lambda x: x)
         assert_frame_equal(result, df)
 
+    def test_apply_dup_names_multi_agg(self):
+        # GH 21063
+        df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'a'])
+        expected = pd.DataFrame([[0, 1]], columns=['a', 'a'], index=['min'])
+        result = df.agg(['min'])
+
+        tm.assert_frame_equal(result, expected)
+
 
 class TestInferOutputShape(object):
     # the user has supplied an opaque UDF where

From f574421a6aa0e3fa2a4da71a31ef628d3329f35d Mon Sep 17 00:00:00 2001
From: Adam Kim <30554376+snowbeta@users.noreply.github.com>
Date: Thu, 17 May 2018 16:55:14 -0400
Subject: [PATCH 006/116] BUG: type aliasing is not allowed to be compared
 using isinstance() (#21098)

(cherry picked from commit 6cc5f235b083a3505eb4ca9b18cad1f3eda29f5b)
---
 doc/source/whatsnew/v0.23.1.txt | 5 +++++
 pandas/compat/__init__.py       | 2 +-
 pandas/tests/test_compat.py     | 7 ++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 35d150dc263b8..9c19d4d6bbaad 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -48,6 +48,11 @@ Groupby/Resample/Rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
+
+Strings
+^^^^^^^
+
+- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`)
 -
 
 Conversion
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 12517372fedd1..5ae22694d0da7 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -425,7 +425,7 @@ def raise_with_traceback(exc, traceback=Ellipsis):
 
 # In Python 3.7, the private re._pattern_type is removed.
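A short check of the compat shim above; the assertion mirrors the new test
(Python 2/3-era pandas assumed):

    import re
    from pandas.compat import re_type

    # typing.re.Pattern cannot be used with isinstance() on Python 3.5.2,
    # so compat only takes the typing path on 3.6+
    assert isinstance(re.compile('ab*'), re_type)
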
# Python 3.5+ have typing.re.Pattern
-if PY35:
+if PY36:
     import typing
     re_type = typing.re.Pattern
 else:
diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py
index ead9ba1e26e2d..79d3aad493182 100644
--- a/pandas/tests/test_compat.py
+++ b/pandas/tests/test_compat.py
@@ -4,9 +4,10 @@
 """
 
 import pytest
+import re
 from pandas.compat import (range, zip, map, filter, lrange, lzip, lmap,
                            lfilter, builtins, iterkeys, itervalues, iteritems,
-                           next, get_range_parameters, PY2)
+                           next, get_range_parameters, PY2, re_type)
 
 
 class TestBuiltinIterators(object):
@@ -89,3 +90,7 @@ def test_get_range_parameters(self, start, stop, step):
         assert start_result == start_expected
         assert stop_result == stop_expected
         assert step_result == step_expected
+
+
+def test_re_type():
+    assert isinstance(re.compile(''), re_type)

From 14ad1993fc3c28b540e6a87d8f80c9c7ba30eeff Mon Sep 17 00:00:00 2001
From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com>
Date: Fri, 18 May 2018 11:23:45 +0530
Subject: [PATCH 007/116] BUG: make .reset_index() raise when passed an
 invalid level name (#21016)

closes #20925

(cherry picked from commit e033c0616158d3ba974456b4f84810492936b1fe)
---
 doc/source/whatsnew/v0.23.1.txt        |  2 +-
 pandas/core/series.py                  |  7 ++++---
 pandas/tests/series/test_alter_axes.py | 20 ++++++++++++++++++++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 9c19d4d6bbaad..9382d74f95295 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -64,7 +64,7 @@ Conversion
 Indexing
 ^^^^^^^^
 
--
+- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
 -
 
 I/O
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0e2ae22f35af7..6d396e845219e 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1195,12 +1195,13 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         if drop:
             new_index = com._default_index(len(self))
-            if level is not None and isinstance(self.index, MultiIndex):
+            if level is not None:
                 if not isinstance(level, (tuple, list)):
                     level = [level]
                 level = [self.index._get_level_number(lev) for lev in level]
-                if len(level) < len(self.index.levels):
-                    new_index = self.index.droplevel(level)
+                if isinstance(self.index, MultiIndex):
+                    if len(level) < self.index.nlevels:
+                        new_index = self.index.droplevel(level)
 
         if inplace:
             self.index = new_index
diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py
index dce4e82cbdcf1..859082a7e722d 100644
--- a/pandas/tests/series/test_alter_axes.py
+++ b/pandas/tests/series/test_alter_axes.py
@@ -188,6 +188,11 @@ def test_reset_index_level(self):
             with tm.assert_raises_regex(IndexError, 'Too many levels'):
                 s.reset_index(level=[0, 1, 2])
 
+        # Check that .reset_index([],drop=True) doesn't fail
+        result = pd.Series(range(4)).reset_index([], drop=True)
+        expected = pd.Series(range(4))
+        assert_series_equal(result, expected)
+
     def test_reset_index_range(self):
         # GH 12071
         s = pd.Series(range(2), name='A', dtype='int64')
@@ -275,3 +280,18 @@ def test_set_axis_prior_to_deprecation_signature(self):
         with tm.assert_produces_warning(FutureWarning):
             result = s.set_axis(0, list('abcd'), inplace=False)
         tm.assert_series_equal(result, expected)
+
+    def test_reset_index_drop_errors(self):
+        # GH 20925
+
+        # KeyError raised for series index when passed level name is missing
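Sketch of the stricter validation the patch introduces (hedged; mirrors the
new tests):

    import pandas as pd

    s = pd.Series(range(4))
    s.reset_index(drop=True)             # still fine without a level
    try:
        s.reset_index('wrong', drop=True)
    except KeyError:
        pass                             # now raised even on a flat index
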
+        s = pd.Series(range(4))
+        with tm.assert_raises_regex(KeyError, 'must be same as name'):
+            s.reset_index('wrong', drop=True)
+        with tm.assert_raises_regex(KeyError, 'must be same as name'):
+            s.reset_index('wrong')
+
+        # KeyError raised for series when level to be dropped is missing
+        s = pd.Series(range(4), index=pd.MultiIndex.from_product([[1, 2]] * 2))
+        with tm.assert_raises_regex(KeyError, 'not found'):
+            s.reset_index('wrong', drop=True)

From 93c8a57d30859811883d3c317e785095caa1a491 Mon Sep 17 00:00:00 2001
From: Giftlin Rajaiah
Date: Sat, 19 May 2018 23:16:10 +0530
Subject: [PATCH 008/116] Spell check (#21130)

(cherry picked from commit cc8d33e00bcf7d1e0bf08f58ffae3f16d37ff118)
---
 pandas/core/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index aa051c6f5eaef..c331ead8d2fef 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -114,7 +114,7 @@ def _reset_cache(self, key=None):
 
     def __sizeof__(self):
         """
-        Generates the total memory usage for a object that returns
+        Generates the total memory usage for an object that returns
         either a value or Series of values
         """
         if hasattr(self, 'memory_usage'):

From 7afc7011eb4b5100c4c10564a2ccf40de5cfc533 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Sat, 19 May 2018 13:08:18 -0700
Subject: [PATCH 009/116] Replaced open with Context Mgrs in Parser Tests
 (#21105)

(cherry picked from commit ed784a897047db2aab88e3cd87e62f17647289a4)
---
 pandas/tests/io/parser/common.py          | 17 +++++++++--------
 pandas/tests/io/parser/compression.py     | 15 +++++++--------
 pandas/tests/io/parser/test_textreader.py | 12 +++---------
 3 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 2423ddcd9a1a0..2b7ff1f5a9879 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -54,20 +54,21 @@ def test_bad_stream_exception(self):
         # and C engine will raise UnicodeDecodeError instead of
         # c engine raising ParserError and swallowing exception
         # that caused read to fail.
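The refactor above leans on the fact that pandas leaves caller-provided
handles open, so a with-block keeps ownership explicit. A minimal usage
sketch ('data.csv' is a placeholder path, not a file from the test suite):

    import pandas as pd

    with open('data.csv') as f:
        df = pd.read_csv(f, index_col=0, compression='infer')
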
-        handle = open(self.csv_shiftjs, "rb")
         codec = codecs.lookup("utf-8")
         utf8 = codecs.lookup('utf-8')
-        # stream must be binary UTF8
-        stream = codecs.StreamRecoder(
-            handle, utf8.encode, utf8.decode, codec.streamreader,
-            codec.streamwriter)
+
         if compat.PY3:
             msg = "'utf-8' codec can't decode byte"
         else:
             msg = "'utf8' codec can't decode byte"
-        with tm.assert_raises_regex(UnicodeDecodeError, msg):
-            self.read_csv(stream)
-        stream.close()
+
+        # stream must be binary UTF8
+        with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder(
+                handle, utf8.encode, utf8.decode, codec.streamreader,
+                codec.streamwriter) as stream:
+
+            with tm.assert_raises_regex(UnicodeDecodeError, msg):
+                self.read_csv(stream)
 
     def test_read_csv(self):
         if not compat.PY3:
diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py
index 01c6620e50d37..e84db66561c49 100644
--- a/pandas/tests/io/parser/compression.py
+++ b/pandas/tests/io/parser/compression.py
@@ -110,16 +110,15 @@ def test_read_csv_infer_compression(self):
         # see gh-9770
         expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
 
-        inputs = [self.csv1, self.csv1 + '.gz',
-                  self.csv1 + '.bz2', open(self.csv1)]
+        with open(self.csv1) as f:
+            inputs = [self.csv1, self.csv1 + '.gz',
+                      self.csv1 + '.bz2', f]
 
-        for f in inputs:
-            df = self.read_csv(f, index_col=0, parse_dates=True,
-                               compression='infer')
-
-            tm.assert_frame_equal(expected, df)
+            for inp in inputs:
+                df = self.read_csv(inp, index_col=0, parse_dates=True,
+                                   compression='infer')
 
-        inputs[3].close()
+                tm.assert_frame_equal(expected, df)
 
     def test_read_csv_compressed_utf16_example(self):
         # GH18071
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index ab4c14034cd20..e8d9d8b52164b 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -35,24 +35,18 @@ def setup_method(self, method):
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
 
     def test_file_handle(self):
-        try:
-            f = open(self.csv1, 'rb')
+        with open(self.csv1, 'rb') as f:
             reader = TextReader(f)
-            result = reader.read()  # noqa
-        finally:
-            f.close()
+            reader.read()
 
     def test_string_filename(self):
         reader = TextReader(self.csv1, header=None)
         reader.read()
 
     def test_file_handle_mmap(self):
-        try:
-            f = open(self.csv1, 'rb')
+        with open(self.csv1, 'rb') as f:
             reader = TextReader(f, memory_map=True, header=None)
             reader.read()
-        finally:
-            f.close()
 
     def test_StringIO(self):
         with open(self.csv1, 'rb') as f:

From c48db72b78ffe471acedd9eceb3d7b78200dd7d0 Mon Sep 17 00:00:00 2001
From: Aly Sivji <4369343+alysivji@users.noreply.github.com>
Date: Sat, 19 May 2018 15:10:00 -0500
Subject: [PATCH 010/116] BUG: assert_index_equal does not raise error for
 check_categorical=False when comparing 2 CategoricalIndex objects (#21092)

(cherry picked from commit af2b6094b93ec04c5f26c16552cf339b4d037150)
---
 doc/source/whatsnew/v0.23.1.txt   |  5 ++++
 pandas/tests/util/test_testing.py | 38 +++++++++++++++++++++++++++++++
 pandas/util/testing.py            | 29 +++++++++++++----------
 3 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 9382d74f95295..5a553264e828b 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -55,6 +55,11 @@ Strings
 - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`)
 -
 
+Categorical
+^^^^^^^^^^^
+
+- Bug in
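What the relaxed check permits, in one hedged snippet (mirrors the new test;
not part of the commit):

    import pandas as pd
    import pandas.util.testing as tm

    left = pd.Index(pd.Categorical(['a', 'b']))
    right = pd.Index(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']))

    # same values, different CategoricalDtype: passes only because the
    # dtype comparison is skipped
    tm.assert_index_equal(left, right, check_categorical=False)
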
+- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`)
+
 Conversion
 ^^^^^^^^^^
 
diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py
index d6f58d16bcf64..ab7c4fb528452 100644
--- a/pandas/tests/util/test_testing.py
+++ b/pandas/tests/util/test_testing.py
@@ -503,6 +503,25 @@ def test_index_equal_metadata_message(self):
         with tm.assert_raises_regex(AssertionError, expected):
             assert_index_equal(idx1, idx2)
 
+    def test_categorical_index_equality(self):
+        expected = """Index are different
+
+Attribute "dtype" are different
+\\[left\\]:  CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\)
+\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \
+ordered=False\\)"""
+
+        with tm.assert_raises_regex(AssertionError, expected):
+            assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])),
+                               pd.Index(pd.Categorical(['a', 'b'],
+                                        categories=['a', 'b', 'c'])))
+
+    def test_categorical_index_equality_relax_categories_check(self):
+        assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])),
+                           pd.Index(pd.Categorical(['a', 'b'],
+                                    categories=['a', 'b', 'c'])),
+                           check_categorical=False)
+
 
 class TestAssertSeriesEqual(object):
 
@@ -600,6 +619,25 @@ def test_series_equal_message(self):
             assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4]),
                                 check_less_precise=True)
 
+    def test_categorical_series_equality(self):
+        expected = """Attributes are different
+
+Attribute "dtype" are different
+\\[left\\]:  CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\)
+\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \
+ordered=False\\)"""
+
+        with tm.assert_raises_regex(AssertionError, expected):
+            assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])),
+                                pd.Series(pd.Categorical(['a', 'b'],
+                                          categories=['a', 'b', 'c'])))
+
+    def test_categorical_series_equality_relax_categories_check(self):
+        assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])),
+                            pd.Series(pd.Categorical(['a', 'b'],
+                                      categories=['a', 'b', 'c'])),
+                            check_categorical=False)
+
 
 class TestAssertFrameEqual(object):
 
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index e1484a9c1b390..233eba6490937 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -778,8 +778,12 @@ def assert_index_equal(left, right, exact='equiv', check_names=True,
 
     def _check_types(l, r, obj='Index'):
         if exact:
-            assert_class_equal(left, right, exact=exact, obj=obj)
-            assert_attr_equal('dtype', l, r, obj=obj)
+            assert_class_equal(l, r, exact=exact, obj=obj)
+
+            # Skip exact dtype checking when `check_categorical` is False
+            if check_categorical:
+                assert_attr_equal('dtype', l, r, obj=obj)
+
             # allow string-like to have different inferred_types
             if l.inferred_type in ('string', 'unicode'):
                 assert r.inferred_type in ('string', 'unicode')
@@ -829,7 +833,8 @@ def _get_ilevel_values(index, level):
             # get_level_values may change dtype
             _check_types(left.levels[level], right.levels[level], obj=obj)
 
-    if check_exact:
+    # skip exact index checking when `check_categorical` is False
+    if check_exact and check_categorical:
         if not left.equals(right):
             diff = np.sum((left.values != right.values)
                           .astype(int)) * 100.0 / len(left)
@@ -950,23 +955,23 @@ def is_sorted(seq):
 
 
 def assert_categorical_equal(left, right, check_dtype=True,
-                             obj='Categorical', check_category_order=True):
+                             check_category_order=True, obj='Categorical'):
    """Test that
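A compact illustration of the now-consistent sorting behavior the next patch
establishes (GH 21052; assumes pandas 0.23.x; see the patch below):

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([(2, 1, 3), (2, 1, 2), (1, 1, 1)],
                                   names=list('ABC'))
    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)

    # level=0 and level='A' sort identically after the fix
    assert df.sort_index(level=0).equals(df.sort_index(level='A'))
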
Categoricals are equivalent.
 
     Parameters
     ----------
-    left, right : Categorical
-        Categoricals to compare
+    left : Categorical
+    right : Categorical
     check_dtype : bool, default True
         Check that integer dtype of the codes are the same
-    obj : str, default 'Categorical'
-        Specify object name being compared, internally used to show appropriate
-        assertion message
     check_category_order : bool, default True
         Whether the order of the categories should be compared, which
         implies identical integer codes.  If False, only the resulting
         values are compared.  The ordered attribute is
         checked regardless.
+    obj : str, default 'Categorical'
+        Specify object name being compared, internally used to show appropriate
+        assertion message
     """
     _check_isinstance(left, right, Categorical)
 
@@ -1020,7 +1025,7 @@ def raise_assert_detail(obj, message, left, right, diff=None):
 
 def assert_numpy_array_equal(left, right, strict_nan=False,
                              check_dtype=True, err_msg=None,
-                             obj='numpy array', check_same=None):
+                             check_same=None, obj='numpy array'):
     """ Checks that 'np.ndarray' is equivalent
 
     Parameters
     ----------
     left : np.ndarray or iterable
     right : np.ndarray or iterable
     strict_nan : bool, default False
         If True, consider NaN and None to be different.
     check_dtype: bool, default True
         check dtype if both a and b are np.ndarray
     err_msg : str, default None
         If provided, used as assertion message
+    check_same : None|'copy'|'same', default None
+        Ensure left and right refer/do not refer to the same memory area
     obj : str, default 'numpy array'
         Specify object name being compared, internally used to show appropriate
         assertion message
-    check_same : None|'copy'|'same', default None
-        Ensure left and right refer/do not refer to the same memory area
     """
 
     # instance validation

From 913e46afe5a67eab804a822f619758c5c2aca385 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Sat, 19 May 2018 13:15:03 -0700
Subject: [PATCH 011/116] Fix Inconsistent MultiIndex Sorting (#21043)

(cherry picked from commit bc37ea2e05019a89adaa48159b220483598d1898)
---
 pandas/core/frame.py                |  8 +++----
 pandas/core/series.py               |  2 +-
 pandas/tests/frame/test_reshape.py  | 17 +++++++++++++++
 pandas/tests/frame/test_sorting.py  | 34 ++++++++++++++++++++++-------
 pandas/tests/series/test_sorting.py |  9 ++++----
 5 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 77a67c048a48d..b6c33b4f79478 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4454,7 +4454,10 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         axis = self._get_axis_number(axis)
         labels = self._get_axis(axis)
 
-        if level:
+        # make sure that the axis is lexsorted to start
+        # if not we need to reconstruct to get the correct indexer
+        labels = labels._sort_levels_monotonic()
+        if level is not None:
 
             new_axis, indexer = labels.sortlevel(level, ascending=ascending,
                                                  sort_remaining=sort_remaining)
@@ -4462,9 +4465,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         elif isinstance(labels, MultiIndex):
             from pandas.core.sorting import lexsort_indexer
 
-            # make sure that the axis is lexsorted to start
-            # if not we need to reconstruct to get the correct indexer
-            labels = labels._sort_levels_monotonic()
             indexer = lexsort_indexer(labels._get_labels_for_sorting(),
                                       orders=ascending,
                                       na_position=na_position)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 6d396e845219e..3d158f1aa9aad 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2617,7 +2617,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         axis = self._get_axis_number(axis)
         index = self.index
 
-        if
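The previously failing case, sketched (the concatenated result is simply
unnamed when the names disagree; not part of the commit):

    import numpy as np
    import pandas as pd

    s1 = pd.Series({'a': 1, 'b': 2}, name=np.int64(190))
    s2 = pd.Series({'c': 5, 'd': 6}, name=(43, 0))

    # comparing a numpy scalar name with a tuple name raised ValueError
    # before this fix
    assert pd.concat([s1, s2]).name is None
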
level:
+        if level is not None:
             new_index, indexer = index.sortlevel(level, ascending=ascending,
                                                  sort_remaining=sort_remaining)
         elif isinstance(index, MultiIndex):
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index d89731dc09044..d05321abefca6 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -861,6 +861,23 @@ def test_stack_preserve_categorical_dtype(self):
 
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("level", [0, 'baz'])
+    def test_unstack_swaplevel_sortlevel(self, level):
+        # GH 20994
+        mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
+                                        names=['bar', 'baz'])
+        df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
+        df.columns.name = 'foo'
+
+        expected = pd.DataFrame([
+            [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
+                ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
+                    'baz', 'foo']))
+        expected.index.name = 'bar'
+
+        result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
+        tm.assert_frame_equal(result, expected)
+
 
 def test_unstack_fill_frame_object():
     # GH12815 Test unstacking with object.
diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py
index b60eb89e87da5..599ae683f914b 100644
--- a/pandas/tests/frame/test_sorting.py
+++ b/pandas/tests/frame/test_sorting.py
@@ -550,18 +550,36 @@ def test_sort_index(self):
             expected = frame.iloc[:, ::-1]
             assert_frame_equal(result, expected)
 
-    def test_sort_index_multiindex(self):
+    @pytest.mark.parametrize("level", ['A', 0])  # GH 21052
+    def test_sort_index_multiindex(self, level):
         # GH13496
 
         # sort rows by specified level of multi-index
-        mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC'))
-        df = DataFrame([[1, 2], [3, 4]], mi)
+        mi = MultiIndex.from_tuples([
+            [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
+        df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
+
+        expected_mi = MultiIndex.from_tuples([
+            [1, 1, 1],
+            [2, 1, 2],
+            [2, 1, 3]], names=list('ABC'))
+        expected = pd.DataFrame([
+            [5, 6],
+            [3, 4],
+            [1, 2]], index=expected_mi)
+        result = df.sort_index(level=level)
+        assert_frame_equal(result, expected)
 
-        # MI sort, but no level: sort_level has no effect
-        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
-        df = DataFrame([[1, 2], [3, 4]], mi)
-        result = df.sort_index(sort_remaining=False)
-        expected = df.sort_index()
+        # sort_remaining=False
+        expected_mi = MultiIndex.from_tuples([
+            [1, 1, 1],
+            [2, 1, 3],
+            [2, 1, 2]], names=list('ABC'))
+        expected = pd.DataFrame([
+            [5, 6],
+            [1, 2],
+            [3, 4]], index=expected_mi)
+        result = df.sort_index(level=level, sort_remaining=False)
         assert_frame_equal(result, expected)
 
     def test_sort_index_intervalindex(self):
diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py
index 01b4ea6eaa238..13e0d1b12c372 100644
--- a/pandas/tests/series/test_sorting.py
+++ b/pandas/tests/series/test_sorting.py
@@ -141,19 +141,20 @@ def test_sort_index_inplace(self):
         assert result is None
         tm.assert_series_equal(random_order, self.ts)
 
-    def test_sort_index_multiindex(self):
+    @pytest.mark.parametrize("level", ['A', 0])  # GH 21052
+    def test_sort_index_multiindex(self, level):
 
         mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
         s = Series([1, 2], mi)
         backwards = s.iloc[[1, 0]]
 
         # implicit sort_remaining=True
-        res = s.sort_index(level='A')
+        res = s.sort_index(level=level)
         assert_series_equal(backwards, res)
 
         # GH13496
-        # rows share same level='A': sort
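Assuming the DirNamesMixin filtering in the next patch applies to Index (it
does in this series), deprecated attribute names disappear from dir(), which
is what keeps completers quiet (illustrative sketch; see the patch below):

    import pandas as pd

    idx = pd.Index([1, 2])
    assert 'base' not in dir(idx)  # filtered, so completers never touch it
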
has no effect without remaining lvls
-        res = s.sort_index(level='A', sort_remaining=False)
+        # sort has no effect without remaining lvls
+        res = s.sort_index(level=level, sort_remaining=False)
         assert_series_equal(s, res)
 
     def test_sort_index_kind(self):

From 04f5a6e074f353bacc722420daf194c7480a9c06 Mon Sep 17 00:00:00 2001
From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com>
Date: Mon, 21 May 2018 16:11:19 +0530
Subject: [PATCH 012/116] BUG: Should not raise error in concatenating Series
 with numpy scalar and tuple names (GH21015) (#21132)

(cherry picked from commit e80cc43d9eeaec088bbbe61b4bba15e2aa993aed)
---
 doc/source/whatsnew/v0.23.1.txt     |  2 +-
 pandas/core/common.py               |  7 +++++--
 pandas/tests/reshape/test_concat.py | 11 +++++++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 5a553264e828b..cefe4ea9be2ad 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -87,7 +87,7 @@ Plotting
 Reshaping
 ^^^^^^^^^
 
--
+- Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`)
 -
 
 Categorical
diff --git a/pandas/core/common.py b/pandas/core/common.py
index b9182bfd2cbe2..1de8269c9a0c6 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -55,8 +55,11 @@ def flatten(l):
 def _consensus_name_attr(objs):
     name = objs[0].name
     for obj in objs[1:]:
-        if obj.name != name:
-            return None
+        try:
+            if obj.name != name:
+                name = None
+        except ValueError:
+            name = None
     return name
 
 
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index f5e58fa70e1c4..dea305d4b3fee 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -2487,3 +2487,14 @@ def test_concat_aligned_sort_does_not_raise():
                             columns=[1, 'a'])
     result = pd.concat([df, df], ignore_index=True, sort=True)
    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("s1name,s2name", [
+    (np.int64(190), (43, 0)), (190, (43, 0))])
+def test_concat_series_name_npscalar_tuple(s1name, s2name):
+    # GH21015
+    s1 = pd.Series({'a': 1, 'b': 2}, name=s1name)
+    s2 = pd.Series({'c': 5, 'd': 6}, name=s2name)
+    result = pd.concat([s1, s2])
+    expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6})
+    tm.assert_series_equal(result, expected)

From ea5fdb6f3a1026a8ff9d3befe1cfda63c196c5cf Mon Sep 17 00:00:00 2001
From: topper-123
Date: Mon, 21 May 2018 11:44:49 +0100
Subject: [PATCH 013/116] DEPR: Add deprecated index attribute names to
 deprecation list (#21125)

(cherry picked from commit 0ebbafd13b586a7f41e089edfb509127ea00b93a)
---
 doc/source/whatsnew/v0.23.1.txt   |  2 ++
 pandas/core/accessor.py           |  3 ++-
 pandas/tests/indexes/test_base.py | 11 +++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index cefe4ea9be2ad..bed21a3570cdb 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -44,6 +44,8 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
+- tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`)
+
 Groupby/Resample/Rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
index c638b9e4ea117..7a853d575aa69 100644
--- a/pandas/core/accessor.py
+++ b/pandas/core/accessor.py
@@ -12,7 +12,8 @@
 
 class DirNamesMixin(object):
     _accessors = frozenset([])
-    _deprecations = frozenset(['asobject'])
+
_deprecations = frozenset(
+        ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides'])
 
     def _dir_deletions(self):
         """ delete unwanted __dir__ for this object """
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index f4fa547574b9e..1e4dd2921b3f5 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -2088,6 +2088,17 @@ def test_get_duplicates_deprecated(self):
         with tm.assert_produces_warning(FutureWarning):
             index.get_duplicates()
 
+    def test_tab_complete_warning(self, ip):
+        # https://github.com/pandas-dev/pandas/issues/16409
+        pytest.importorskip('IPython', minversion="6.0.0")
+        from IPython.core.completer import provisionalcompleter
+
+        code = "import pandas as pd; idx = pd.Index([1, 2])"
+        ip.run_code(code)
+        with tm.assert_produces_warning(None):
+            with provisionalcompleter('ignore'):
+                list(ip.Completer.completions('idx.', 4))
+
 
 class TestMixedIntIndex(Base):
     # Mostly the tests from common.py for which the results differ

From 24edb3ac80f95b790d7ee8cfb68b5101ae99c4df Mon Sep 17 00:00:00 2001
From: Chalmer Lowe
Date: Mon, 21 May 2018 01:00:04 -1000
Subject: [PATCH 014/116] DOC: Improve the docstring of Timedelta.delta redux
 (#21138)

(cherry picked from commit 508ec3d3686338c7ddb4b5b121c677c6864b1f80)
---
 pandas/_libs/tslibs/timedeltas.pyx | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index f7bb6c1dbb304..3f0b4db87e5ed 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -760,7 +760,32 @@ cdef class _Timedelta(timedelta):
 
     @property
     def delta(self):
-        """ return out delta in ns (for internal compat) """
+        """
+        Return the timedelta in nanoseconds (ns), for internal compatibility.
+
+        Returns
+        -------
+        int
+            Timedelta in nanoseconds.
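The precision guarantee that the next patch establishes, in one hedged
example (values taken from its new tests; see the patch below):

    import pandas as pd

    # float inputs keep nanosecond precision instead of rounding early
    assert pd.Timedelta(9.999, unit='us').value == 9999
    assert pd.Timedelta(9.999999999, unit='s').value == 9999999999
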
+
+        Examples
+        --------
+        >>> td = pd.Timedelta('1 days 42 ns')
+        >>> td.delta
+        86400000000042
+
+        >>> td = pd.Timedelta('3 s')
+        >>> td.delta
+        3000000000
+
+        >>> td = pd.Timedelta('3 ms 5 us')
+        >>> td.delta
+        3005000
+
+        >>> td = pd.Timedelta(42, unit='ns')
+        >>> td.delta
+        42
+        """
         return self.value
 
     @property

From e3d7b6fdd10b70fcced49d23713fbec9bec7ad4f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Mon, 21 May 2018 04:06:03 -0700
Subject: [PATCH 015/116] BUG: Avoid Timedelta rounding when specifying unit
 and integer (#12690) (#19732)

(cherry picked from commit 81358e862022cedbac009985abac5135a873dde3)
---
 doc/source/whatsnew/v0.23.1.txt               |  5 +-
 pandas/_libs/tslibs/timedeltas.pyx            | 16 ++--
 pandas/tests/indexes/datetimes/test_tools.py  |  8 ++
 pandas/tests/io/sas/test_sas7bdat.py          |  2 +
 .../tests/scalar/timedelta/test_timedelta.py  | 10 +++
 .../tests/scalar/timestamp/test_timestamp.py  | 85 ++++++++++---------
 6 files changed, 79 insertions(+), 47 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index bed21a3570cdb..47023daeaaa27 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -55,7 +55,10 @@ Strings
 ^^^^^^^
 
 - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`)
--
+
+Timedelta
+^^^^^^^^^
+- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`)
 
 Categorical
 ^^^^^^^^^^^
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 3f0b4db87e5ed..4c237da7b6d0e 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -202,22 +202,22 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
 
     if unit == 'D' or unit == 'd':
         m = 1000000000L * 86400
-        p = 6
+        p = 9
     elif unit == 'h':
         m = 1000000000L * 3600
-        p = 6
+        p = 9
     elif unit == 'm':
         m = 1000000000L * 60
-        p = 6
+        p = 9
     elif unit == 's':
         m = 1000000000L
-        p = 6
+        p = 9
     elif unit == 'ms':
         m = 1000000L
-        p = 3
+        p = 6
     elif unit == 'us':
         m = 1000L
-        p = 0
+        p = 3
     elif unit == 'ns' or unit is None:
         m = 1L
         p = 0
@@ -231,10 +231,10 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except?
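Why the escaping in the patch just above matters, sketched with a raw string
(same data as the docstring example, shortened; illustrative only):

    import pandas as pd

    s = pd.Series(['$', 'Aab$', 'cat'])
    # r'\$' reaches the regex engine as \$ without Python first
    # interpreting an invalid '\$' escape in the source literal
    assert s.str.count(r'\$').tolist() == [1, 1, 0]
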
-1:
     # cast the unit, multiply base/frace separately
     # to avoid precision issues from float -> int
     base = ts
-    frac = ts -base
+    frac = ts - base
     if p:
         frac = round(frac, p)
-    return (base *m) + (frac *m)
+    return (base * m) + (frac * m)
 
 
 cdef inline _decode_if_necessary(object ts):
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index 45be3974dad63..8b0514764b0c0 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -650,6 +650,14 @@ def test_unit_mixed(self, cache):
             with pytest.raises(ValueError):
                 pd.to_datetime(arr, errors='raise', cache=cache)
 
+    @pytest.mark.parametrize('cache', [True, False])
+    def test_unit_rounding(self, cache):
+        # GH 14156: argument will incur floating point errors but no
+        # premature rounding
+        result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache)
+        expected = pd.Timestamp('2015-06-19 19:55:31.877000093')
+        assert result == expected
+
     @pytest.mark.parametrize('cache', [True, False])
     def test_dataframe(self, cache):
 
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index 5da347e47957c..b80263021c269 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -182,6 +182,8 @@ def test_date_time():
     fname = os.path.join(dirpath, "datetime.csv")
     df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                           'DateTimeHi', 'Taiw'])
+    # GH 19732: Timestamps imported from sas will incur floating point errors
+    df.iloc[:, 3] = df.iloc[:, 3].dt.round('us')
     tm.assert_frame_equal(df, df0)
 
 
diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py
index 3fdc2aa71bfc0..205fdf49d3e91 100644
--- a/pandas/tests/scalar/timedelta/test_timedelta.py
+++ b/pandas/tests/scalar/timedelta/test_timedelta.py
@@ -106,6 +106,16 @@ def test_compare_timedelta_ndarray(self):
 
 class TestTimedeltas(object):
 
+    @pytest.mark.parametrize("unit, value, expected", [
+        ('us', 9.999, 9999), ('ms', 9.999999, 9999999),
+        ('s', 9.999999999, 9999999999)])
+    def test_rounding_on_int_unit_construction(self, unit, value, expected):
+        # GH 12690
+        result = Timedelta(value, unit=unit)
+        assert result.value == expected
+        result = Timedelta(str(value) + unit)
+        assert result.value == expected
+
     def test_total_seconds_scalar(self):
         # see gh-10939
         rng = Timedelta('1 days, 10:11:12.100123456')
diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py
index b022b327de57c..ab87d98fca8eb 100644
--- a/pandas/tests/scalar/timestamp/test_timestamp.py
+++ b/pandas/tests/scalar/timestamp/test_timestamp.py
@@ -621,10 +621,51 @@ def test_basics_nanos(self):
         assert stamp.microsecond == 145224
         assert stamp.nanosecond == 192
 
-    def test_unit(self):
-
-        def check(val, unit=None, h=1, s=1, us=0):
-            stamp = Timestamp(val, unit=unit)
+    @pytest.mark.parametrize('value, check_kwargs', [
+        [946688461000000000, {}],
+        [946688461000000000 / long(1000), dict(unit='us')],
+        [946688461000000000 / long(1000000), dict(unit='ms')],
+        [946688461000000000 / long(1000000000), dict(unit='s')],
+        [10957, dict(unit='D', h=0)],
+        pytest.param((946688461000000000 + 500000) / long(1000000000),
+                     dict(unit='s', us=499, ns=964),
+                     marks=pytest.mark.skipif(not PY3,
                                               reason='using truediv, so these'
                                                      ' are like floats')),
+        pytest.param((946688461000000000 + 500000000) / long(1000000000),
+                     dict(unit='s', us=500000),
+                     marks=pytest.mark.skipif(not PY3,
+                                              reason='using truediv, so these'
+                                                     ' are like floats')),
+        pytest.param((946688461000000000 + 500000) / long(1000000),
+                     dict(unit='ms', us=500),
+                     marks=pytest.mark.skipif(not PY3,
+                                              reason='using truediv, so these'
+                                                     ' are like floats')),
+        pytest.param((946688461000000000 + 500000) / long(1000000000),
+                     dict(unit='s'),
+                     marks=pytest.mark.skipif(PY3,
+                                              reason='get chopped in py2')),
+        pytest.param((946688461000000000 + 500000000) / long(1000000000),
+                     dict(unit='s'),
+                     marks=pytest.mark.skipif(PY3,
+                                              reason='get chopped in py2')),
+        pytest.param((946688461000000000 + 500000) / long(1000000),
+                     dict(unit='ms'),
+                     marks=pytest.mark.skipif(PY3,
+                                              reason='get chopped in py2')),
+        [(946688461000000000 + 500000) / long(1000), dict(unit='us', us=500)],
+        [(946688461000000000 + 500000000) / long(1000000),
+         dict(unit='ms', us=500000)],
+        [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)],
+        [946688461000000000 / 1000.0 + 5000, dict(unit='us', us=5000)],
+        [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)],
+        [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)],
+        [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)],
+        [10957 + 0.5, dict(unit='D', h=12)]])
+    def test_unit(self, value, check_kwargs):
+        def check(value, unit=None, h=1, s=1, us=0, ns=0):
+            stamp = Timestamp(value, unit=unit)
             assert stamp.year == 2000
             assert stamp.month == 1
             assert stamp.day == 1
@@ -637,41 +678,9 @@ def check(val, unit=None, h=1, s=1, us=0):
                 assert stamp.minute == 0
                 assert stamp.second == 0
                 assert stamp.microsecond == 0
-            assert stamp.nanosecond == 0
-
-        ts = Timestamp('20000101 01:01:01')
-        val = ts.value
-        days = (ts - Timestamp('1970-01-01')).days
-
-        check(val)
-        check(val / long(1000), unit='us')
-        check(val / long(1000000), unit='ms')
-        check(val / long(1000000000), unit='s')
-        check(days, unit='D', h=0)
+            assert stamp.nanosecond == ns
 
-        # using truediv, so these are like floats
-        if PY3:
-            check((val + 500000) / long(1000000000), unit='s', us=500)
-            check((val + 500000000) / long(1000000000), unit='s', us=500000)
-            check((val + 500000) / long(1000000), unit='ms', us=500)
-
-        # get chopped in py2
-        else:
-            check((val + 500000) / long(1000000000), unit='s')
-            check((val + 500000000) / long(1000000000), unit='s')
-            check((val + 500000) / long(1000000), unit='ms')
-
-        # ok
-        check((val + 500000) / long(1000), unit='us', us=500)
-        check((val + 500000000) / long(1000000), unit='ms', us=500000)
-
-        # floats
-        check(val / 1000.0 + 5, unit='us', us=5)
-        check(val / 1000.0 + 5000, unit='us', us=5000)
-        check(val / 1000000.0 + 0.5, unit='ms', us=500)
-        check(val / 1000000.0 + 0.005, unit='ms', us=5)
-        check(val / 1000000000.0 + 0.5, unit='s', us=500000)
-        check(days + 0.5, unit='D', h=12)
+        check(value, **check_kwargs)
 
     def test_roundtrip(self):
 

From 58eaecb908447046aad8ba33816e778e05c5bd1e Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 21 May 2018 19:11:16 -0400
Subject: [PATCH 016/116] TST: Escape invalid escape characters (#21154)

Partially addresses gh-21137.

(cherry picked from commit ac32ce8fca6bdbc40ec1ca14e45e49d73b5176a5)
---
 pandas/core/strings.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 81d775157cf62..5d50c45fe7eca 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -241,7 +241,7 @@ def str_count(arr, pat, flags=0):
     Escape ``'$'`` to find the literal dollar sign.
 
    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
-    >>> s.str.count('\$')
+    >>> s.str.count('\\$')
     0    1
     1    0
     2    1
@@ -358,7 +358,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
 
     Returning any digit using regular expression.
 
-    >>> s1.str.contains('\d', regex=True)
+    >>> s1.str.contains('\\d', regex=True)
     0    False
     1    False
     2    False

From 3e91eb931fb8baa64713f3f14ac506ef7fb45cf2 Mon Sep 17 00:00:00 2001
From: Jeremy Schendel
Date: Tue, 22 May 2018 01:15:03 -0600
Subject: [PATCH 017/116] DOC: Add linspace range behavior to the
 timeseries/timedeltas/interval docs (#21114)

* DOC: Add linspace range behavior to the timeseries/timedeltas/interval docs

(cherry picked from commit 90c2237677975885a42f0d38ff59ed0f78928e7d)
---
 doc/source/advanced.rst   | 49 +++++++++++++++++++++++++++++++++
 doc/source/timedeltas.rst | 42 +++++++++++++++++++++++++++----
 doc/source/timeseries.rst | 12 ++++++++++
 3 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index c81842d3d9212..ec517d3e07bdf 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -924,6 +924,55 @@ bins, with ``NaN`` representing a missing value similar to other dtypes.
 
    pd.cut([0, 3, 5, 1], bins=c.categories)
 
+
+Generating Ranges of Intervals
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If we need intervals on a regular frequency, we can use the :func:`interval_range` function
+to create an ``IntervalIndex`` using various combinations of ``start``, ``end``, and ``periods``.
+The default frequency for ``interval_range`` is a 1 for numeric intervals, and calendar day for
+datetime-like intervals:
+
+.. ipython:: python
+
+   pd.interval_range(start=0, end=5)
+
+   pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4)
+
+   pd.interval_range(end=pd.Timedelta('3 days'), periods=3)
+
+The ``freq`` parameter can used to specify non-default frequencies, and can utilize a variety
+of :ref:`frequency aliases ` with datetime-like intervals:
+
+.. ipython:: python
+
+   pd.interval_range(start=0, periods=5, freq=1.5)
+
+   pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W')
+
+   pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H')
+
+Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals
+are closed on.  Intervals are closed on the right side by default.
+
+.. ipython:: python
+
+   pd.interval_range(start=0, end=4, closed='both')
+
+   pd.interval_range(start=0, end=4, closed='neither')
+
+.. versionadded:: 0.23.0
+
+Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
+intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements
+in the resulting ``IntervalIndex``:
+
+.. ipython:: python
+
+   pd.interval_range(start=0, end=6, periods=4)
+
+   pd.interval_range(pd.Timestamp('2018-01-01'), pd.Timestamp('2018-02-28'), periods=3)
+
 Miscellaneous indexing FAQ
 --------------------------
 
diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst
index 5f3a01f0725d4..745810704f665 100644
--- a/doc/source/timedeltas.rst
+++ b/doc/source/timedeltas.rst
@@ -352,8 +352,8 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the
 TimedeltaIndex
 --------------
 
-To generate an index with time delta, you can use either the ``TimedeltaIndex`` or
-the ``timedelta_range`` constructor.
+To generate an index with time delta, you can use either the :class:`TimedeltaIndex` or
+the :func:`timedelta_range` constructor.
 
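The repaired float handling of the patch below this note's anchor, in brief
(derived from its new test cases; illustrative, not part of the commit):

    import pandas as pd

    # a float start no longer truncates the final interval (GH 21161)
    result = pd.interval_range(start=0.5, periods=4)
    assert list(result.right) == [1.5, 2.5, 3.5, 4.5]
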
 Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``,
 or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent missing values.
 
@@ -363,13 +363,47 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss
    pd.TimedeltaIndex(['1 days', '1 days, 00:00:05',
                       np.timedelta64(2,'D'), datetime.timedelta(days=2,seconds=2)])
 
-Similarly to ``date_range``, you can construct regular ranges of a ``TimedeltaIndex``:
+Generating Ranges of Time Deltas
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Similar to :func:`date_range`, you can construct regular ranges of a ``TimedeltaIndex``
+using :func:`timedelta_range`.  The default frequency for ``timedelta_range`` is
+calendar day:
+
+.. ipython:: python
+
+   pd.timedelta_range(start='1 days', periods=5)
+
+Various combinations of ``start``, ``end``, and ``periods`` can be used with
+``timedelta_range``:
+
+.. ipython:: python
+
+   pd.timedelta_range(start='1 days', end='5 days')
+
+   pd.timedelta_range(end='10 days', periods=4)
+
+The ``freq`` parameter can passed a variety of :ref:`frequency aliases `:
 
 .. ipython:: python
 
-   pd.timedelta_range(start='1 days', periods=5, freq='D')
    pd.timedelta_range(start='1 days', end='2 days', freq='30T')
 
+   pd.timedelta_range(start='1 days', periods=5, freq='2D5H')
+
+
+.. versionadded:: 0.23.0
+
+Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
+timedeltas from ``start`` to ``end`` inclusively, with ``periods`` number of elements
+in the resulting ``TimedeltaIndex``:
+
+.. ipython:: python
+
+   pd.timedelta_range('0 days', '4 days', periods=5)
+
+   pd.timedelta_range('0 days', '4 days', periods=10)
+
 Using the TimedeltaIndex
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index 73e3e721aad71..1b0cf86995a39 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -393,6 +393,18 @@ of those specified will not be generated:
 
    pd.bdate_range(start=start, periods=20)
 
+.. versionadded:: 0.23.0
+
+Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
+dates from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the
+resulting ``DatetimeIndex``:
+
+.. ipython:: python
+
+   pd.date_range('2018-01-01', '2018-01-05', periods=5)
+
+   pd.date_range('2018-01-01', '2018-01-05', periods=10)
+
 .. _timeseries.custom-freq-ranges:
 
 Custom Frequency Ranges
 
From cc9ff276e2b05f4e6b244b0cd811271d3f00d609 Mon Sep 17 00:00:00 2001
From: zertrin
Date: Wed, 23 May 2018 02:32:37 +0800
Subject: [PATCH 018/116] Small typo in deprecation message added in PR #21060
 (#21170)

(cherry picked from commit cd04471023d7b02dfcc168e5bdfcf1d7f960e8aa)
---
 pandas/_libs/tslibs/timedeltas.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 4c237da7b6d0e..e2b0b33053f83 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1246,7 +1246,7 @@ class Timedelta(_Timedelta):
                 deprecated.  Use 'array // timedelta.value' instead.
                 If you want to obtain epochs from an array of timestamps,
                 you can rather use
-                'array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'.
+                '(array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'.
""") warnings.warn(msg, FutureWarning) return other // self.value From 59e8b9400da1b21775c845da83bc5381a2217348 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 22 May 2018 22:22:14 -0600 Subject: [PATCH 019/116] BUG: Fix invalid truncation in interval_range (#21162) (cherry picked from commit be90d492836fa604b4b914ab6c7387752a6ba9e6) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/indexes/interval.py | 8 ++++---- .../indexes/interval/test_interval_range.py | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 47023daeaaa27..3ffe891408f58 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -75,6 +75,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) +- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - I/O diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 408a8cc435b63..8f8d8760583ce 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1572,6 +1572,10 @@ def interval_range(start=None, end=None, periods=None, freq=None, periods += 1 if is_number(endpoint): + # force consistency between start/end/freq (lower end if freq skips it) + if com._all_not_none(start, end, freq): + end -= (end - start) % freq + # compute the period/start/end if unspecified (at most one) if periods is None: periods = int((end - start) // freq) + 1 @@ -1580,10 +1584,6 @@ def interval_range(start=None, end=None, periods=None, freq=None, elif end is None: end = start + (periods - 1) * freq - # force end to be consistent with freq (lower if freq skips end) - if freq is not None: - end -= end % freq - breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com._not_none(start, end, freq)): # np.linspace always produces float output diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 0fadfcf0c7f28..29fe2b0185662 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -110,6 +110,8 @@ def test_constructor_timedelta(self, closed, name, freq, periods): @pytest.mark.parametrize('start, end, freq, expected_endpoint', [ (0, 10, 3, 9), + (0, 10, 1.5, 9), + (0.5, 10, 3, 9.5), (Timedelta('0D'), Timedelta('10D'), '2D4H', Timedelta('8D16H')), (Timestamp('2018-01-01'), Timestamp('2018-02-09'), @@ -125,6 +127,22 @@ def test_early_truncation(self, start, end, freq, expected_endpoint): result_endpoint = result.right[-1] assert result_endpoint == expected_endpoint + @pytest.mark.parametrize('start, end, freq', [ + (0.5, None, None), + (None, 4.5, None), + (0.5, None, 1.5), + (None, 6.5, 1.5)]) + def test_no_invalid_float_truncation(self, start, end, freq): + # GH 21161 + if freq is None: + breaks = [0.5, 1.5, 2.5, 3.5, 4.5] + else: + breaks = [0.5, 2.0, 3.5, 5.0, 6.5] + expected = IntervalIndex.from_breaks(breaks) + + result = interval_range(start=start, end=end, periods=4, freq=freq) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('start, mid, end', [ (Timestamp('2018-03-10', tz='US/Eastern'), Timestamp('2018-03-10 23:30:00', tz='US/Eastern'), From 2c3f0ffd7b11b3167e52888965be08f6c19a7d82 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 23 May 2018 06:35:42 
From 2c3f0ffd7b11b3167e52888965be08f6c19a7d82 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 23 May 2018 06:35:42 -0400 Subject: [PATCH 020/116] Remove deprecated Slepian test (#21173) Partially addresses gh-21137. (cherry picked from commit 1abfd1bfdb26e9f444b4f44ffbcd2e37026e6497) --- pandas/tests/test_window.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index d8e90ae0e1b35..74f2c977e0db2 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -41,7 +41,7 @@ def win_types(request): return request.param -@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian', 'slepian']) +@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian']) def win_types_special(request): return request.param @@ -1079,8 +1079,7 @@ def test_cmov_window_special(self, win_types_special): kwds = { 'kaiser': {'beta': 1.}, 'gaussian': {'std': 1.}, - 'general_gaussian': {'power': 2., 'width': 2.}, - 'slepian': {'width': 0.5}} + 'general_gaussian': {'power': 2., 'width': 2.}} vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) @@ -1090,8 +1089,6 @@ def test_cmov_window_special(self, win_types_special): 13.65671, 12.01002, np.nan, np.nan], 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161, 13.08516, 12.95111, 12.74577, np.nan, np.nan], - 'slepian': [np.nan, np.nan, 9.81073, 10.89359, 11.70284, 12.88331, - 12.96079, 12.77008, np.nan, np.nan], 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129, 12.90702, 12.83757, np.nan, np.nan] } From c851246f045ff9ac02fa5d3c94949670e4045e79 Mon Sep 17 00:00:00 2001 From: "Dr. Irv" Date: Thu, 24 May 2018 10:59:27 -0400 Subject: [PATCH 021/116] BUG: DecimalArray and JSONArray that are empty return incorrect results for isna() (#21190) (cherry picked from commit 6f1f9759ba8319736c2d51b6d05b071998f1add6) --- pandas/tests/extension/base/missing.py | 5 +++++ pandas/tests/extension/decimal/array.py | 2 +- pandas/tests/extension/json/array.py | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 32cf29818e069..af26d83df3fe2 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -18,6 +18,11 @@ def test_isna(self, data_missing): expected = pd.Series(expected) self.assert_series_equal(result, expected) + # GH 21189 + result = pd.Series(data_missing).drop([0, 1]).isna() + expected = pd.Series([], dtype=bool) + self.assert_series_equal(result, expected) + def test_dropna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.dropna() diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index e9431bd0c233c..90f0181beab0d 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -90,7 +90,7 @@ def nbytes(self): return 0 def isna(self): - return np.array([x.is_nan() for x in self._data]) + return np.array([x.is_nan() for x in self._data], dtype=bool) @property def _na_value(self): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 88bb66f38b35c..10be7836cb8d7 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -108,7 +108,8 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self.dtype.na_value for x in self.data]) + return np.array([x == self.dtype.na_value for x in self.data], + dtype=bool) def take(self, indexer, allow_fill=False, fill_value=None): #
re-implement here, since NumPy has trouble setting From aaa716cd16d4e9344d986de21931ce941e663b09 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 24 May 2018 23:11:45 +0100 Subject: [PATCH 022/116] BUG: Enable stata files to be written to buffers (#21169) Enable support for general file-like objects when exporting stata files closes #21041 (cherry picked from commit f91e28c3fdd4e0708e4cc2ec45a96b068ed0a44b) --- doc/source/whatsnew/v0.23.1.txt | 2 +- pandas/core/frame.py | 7 +++-- pandas/io/stata.py | 54 +++++++++++++++++++++++++++------ pandas/tests/io/test_stata.py | 27 +++++++++++++++++ 4 files changed, 77 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 3ffe891408f58..e4d93b380b282 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -81,7 +81,7 @@ Indexing I/O ^^^ -- +- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - Plotting diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6c33b4f79478..8a1fae90f3eeb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1774,8 +1774,11 @@ def to_stata(self, fname, convert_dates=None, write_index=True, Parameters ---------- - fname : str or buffer - String path of file-like object. + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + data has been written. convert_dates : dict Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8f91c7a497e2d..2797924985c70 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1758,11 +1758,25 @@ def value_labels(self): return self.value_label_dict -def _open_file_binary_write(fname, encoding): +def _open_file_binary_write(fname): + """ + Open a binary file or no-op if file-like + + Parameters + ---------- + fname : string path, path object or buffer + + Returns + ------- + file : file-like object + File object supporting write + own : bool + True if the file was created, otherwise False + """ if hasattr(fname, 'write'): # if 'b' not in fname.mode: - return fname - return open(fname, "wb") + return fname, False + return open(fname, "wb"), True def _set_endianness(endianness): @@ -1899,7 +1913,9 @@ class StataWriter(StataParser): ---------- fname : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written. .. versionadded:: 0.23.0 support for pathlib, py.path.
@@ -1970,6 +1986,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels + self._own_file = True # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -2183,9 +2200,7 @@ def _prepare_pandas(self, data): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._file = _open_file_binary_write( - self._fname, self._encoding or self._default_encoding - ) + self._file, self._own_file = _open_file_binary_write(self._fname) try: self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) @@ -2205,6 +2220,23 @@ def write_file(self): self._write_file_close_tag() self._write_map() finally: + self._close() + + def _close(self): + """ + Close the file if it was created by the writer. + + If a buffer or file-like object was passed in, for example a GzipFile, + then leave this file open for the caller to close. In either case, + attempt to flush the file contents to ensure they are written to disk + (if supported) + """ + # Some file-like objects might not support flush + try: + self._file.flush() + except AttributeError: + pass + if self._own_file: self._file.close() def _write_map(self): @@ -2374,7 +2406,7 @@ def _prepare_data(self): def _write_data(self): data = self.data - data.tofile(self._file) + self._file.write(data.tobytes()) def _null_terminate(self, s, as_string=False): null_byte = '\x00' @@ -2641,7 +2673,9 @@ class StataWriter117(StataWriter): ---------- fname : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written.
data : DataFrame Input to save convert_dates : dict @@ -2879,7 +2913,7 @@ def _write_data(self): self._update_map('data') data = self.data self._file.write(b'<data>') - data.tofile(self._file) + self._file.write(data.tobytes()) self._file.write(b'</data>') def _write_strls(self): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 110b790a65037..f3a465da4e87f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2,6 +2,8 @@ # pylint: disable=E1101 import datetime as dt +import io +import gzip import os import struct import warnings @@ -1473,3 +1475,28 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError): original.to_stata(path, convert_dates={'wrong_name': 'tc'}) + + @pytest.mark.parametrize('version', [114, 117]) + def test_nonfile_writing(self, version): + # GH 21041 + bio = io.BytesIO() + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + df.to_stata(bio, version=version) + bio.seek(0) + with open(path, 'wb') as dta: + dta.write(bio.read()) + reread = pd.read_stata(path, index_col='index') + tm.assert_frame_equal(df, reread) + + def test_gzip_writing(self): + # writing version 117 requires seek and cannot be used with gzip + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + with gzip.GzipFile(path, 'wb') as gz: + df.to_stata(gz, version=114) + with gzip.GzipFile(path, 'rb') as gz: + reread = pd.read_stata(gz, index_col='index') + tm.assert_frame_equal(df, reread)
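A minimal round-trip sketch of the buffer support added above (frame contents invented; this mirrors the new test_nonfile_writing test rather than quoting it):

    import io
    import pandas as pd

    df = pd.DataFrame({'x': [1.0, 2.0]}, index=pd.Index([0, 1], name='index'))
    bio = io.BytesIO()
    df.to_stata(bio)   # any object with a binary write() is now accepted
    bio.seek(0)        # the buffer is left open for the caller to manage
    print(pd.read_stata(bio, index_col='index'))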
From a7611d0072fa266799916e7bd8cd9292dd21ced8 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 25 May 2018 01:09:54 -0600 Subject: [PATCH 023/116] CLN: Remove duplicate Categorical section from 0.23.1 whatsnew (#21197) (cherry picked from commit e0f6c2281bf803889d4ac6c7f8bdfd721715665b) --- doc/source/whatsnew/v0.23.1.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index e4d93b380b282..5c7117ca099ee 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -95,8 +95,3 @@ Reshaping - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - - -Categorical -^^^^^^^^^^^ - -- From c0ee70eab5ef05d43042bcd666cdb81cccc146a0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 25 May 2018 07:32:05 -0400 Subject: [PATCH 024/116] CI: use latest deps for pandas-datareader, python-dateutil (#21204) (cherry picked from commit dc02831f7b267ef152c9bb6a1c8e39c652c1ac3c) --- ci/travis-36.yaml | 4 ++-- pandas/tests/test_downstream.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index fe057e714761e..006276ba1a65f 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -18,12 +18,10 @@ dependencies: - numexpr - numpy - openpyxl - - pandas-datareader - psycopg2 - pyarrow - pymysql - pytables - - python-dateutil - python-snappy - python=3.6* - pytz @@ -45,3 +43,5 @@ dependencies: - pip: - brotlipy - coverage + - pandas-datareader + - python-dateutil diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index a595d9f18d6b8..c28e2052bd93e 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -87,6 +87,7 @@ def test_pandas_gbq(df): pandas_gbq = import_module('pandas_gbq') # noqa +@pytest.mark.xfail(reason="0.7.0 pending") @tm.network def test_pandas_datareader(): @@ -95,6 +96,7 @@ def test_pandas_datareader(): 'F', 'quandl', '2017-01-01', '2017-02-01') +@pytest.mark.xfail(reason="downstream install issue") def test_geopandas(): geopandas = import_module('geopandas') # noqa From 957c5a4a5b20cb44074a15e3cdf2a7f163051c3c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 28 May 2018 14:57:56 -0700 Subject: [PATCH 025/116] BUG: Categorical.fillna with iterables (#21215) Closes https://github.com/pandas-dev/pandas/issues/19788 Closes https://github.com/pandas-dev/pandas/issues/21097 (cherry picked from commit 36c1f6bfeb0d5915e1cd0bca1c91ee5672d945e7) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/arrays/categorical.py | 3 ++- pandas/tests/categorical/test_missing.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 5c7117ca099ee..fd6751ac66d17 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -64,6 +64,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index abcb9ae3494b5..a1a8f098b582e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -12,6 +12,7 @@ from pandas.core.dtypes.generic import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex) from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.cast import ( maybe_infer_to_datetimelike, coerce_indexer_dtype) @@ -1751,7 +1752,7 @@ def fillna(self, value=None, method=None, limit=None): values[indexer] = values_codes[values_codes != -1] # If value is not a dict or Series it should be a scalar - elif is_scalar(value): + elif is_hashable(value): if not isna(value) and value not in self.categories: raise ValueError("fill value must be in categories") diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index 5133c97d8b590..c78f02245a5b4 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import collections + import numpy as np import pytest @@ -68,3 +70,16 @@ def test_fillna_raises(self, fillna_kwargs, msg): with tm.assert_raises_regex(ValueError, msg): cat.fillna(**fillna_kwargs) + + @pytest.mark.parametrize("named", [True, False]) + def test_fillna_iterable_category(self, named): + # https://github.com/pandas-dev/pandas/issues/21097 + if named: + Point = collections.namedtuple("Point", "x y") + else: + Point = lambda *args: args # tuple + cat = Categorical([Point(0, 0), Point(0, 1), None]) + result = cat.fillna(Point(0, 0)) + expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) + + tm.assert_categorical_equal(result, expected)
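To illustrate the fillna change above, a small sketch (values invented, in the spirit of the new test_fillna_iterable_category test): tuples are iterable yet hashable, so they are now accepted as fill values for tuple categories:

    import pandas as pd

    cat = pd.Categorical([(0, 0), (0, 1), None])
    # previously raised TypeError because the `is_scalar` check rejected
    # tuples; the `is_hashable` check accepts them
    print(cat.fillna((0, 0)))  # [(0, 0), (0, 1), (0, 0)]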
From 18ae84c5421e447b3ad6161d9c0f4299314b0f4d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 28 May 2018 18:07:44 -0700 Subject: [PATCH 026/116] DOC: Add numeric_only to DataFrame.quantile (#21214) (cherry picked from commit 7c522bf4d7e6799136eba5e07abe07190aaa4332) --- pandas/core/frame.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8a1fae90f3eeb..d29df6cc5f475 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7089,6 +7089,9 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0 <= q <= 1, the quantile(s) to compute axis : {0, 1, 'index', 'columns'} (default 0) 0 or 'index' for row-wise, 1 or 'columns' for column-wise + numeric_only : boolean, default True + If False, the quantile of datetime and timedelta data will be + computed as well interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} .. versionadded:: 0.18.0 @@ -7116,7 +7119,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, -------- >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), - columns=['a', 'b']) + columns=['a', 'b']) >>> df.quantile(.1) a 1.3 b 3.7 @@ -7126,6 +7129,20 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0.1 1.3 3.7 0.5 2.5 55.0 + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. + + >>> df = pd.DataFrame({'A': [1, 2], + 'B': [pd.Timestamp('2010'), + pd.Timestamp('2011')], + 'C': [pd.Timedelta('1 days'), + pd.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False) + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + See Also -------- pandas.core.window.Rolling.quantile From 952c3640979042de2f14904f837013c8a1a83ead Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 28 May 2018 18:20:38 -0700 Subject: [PATCH 027/116] Stable Sorting Algorithm for Fillna Indexer (#21212) (cherry picked from commit d30cc746f39e8d8442849cdcccc625ea4dd036d2) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/_libs/groupby.pyx | 3 ++- pandas/tests/groupby/test_transform.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index fd6751ac66d17..109afd05136df 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -50,6 +50,7 @@ Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) +- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) Strings ^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 43afd1e0f5969..a6dbaff17e543 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -297,7 +297,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, # Make sure all arrays are the same size assert N == len(labels) == len(mask) - sorted_labels = np.argsort(labels).astype(np.int64, copy=False) + sorted_labels = np.argsort(labels, kind='mergesort').astype( + np.int64, copy=False) if direction == 'bfill': sorted_labels = sorted_labels[::-1] diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 626057c1ea760..7fccf1f57a886 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -721,6 +721,23 @@ def interweave(list_obj): assert_frame_equal(result, exp) +@pytest.mark.parametrize("fill_method", ['ffill', 'bfill']) +def test_pad_stable_sorting(fill_method): + # GH 21207 + x = [0] * 20 + y = [np.nan] * 10 + [1]
* 10 + + if fill_method == 'bfill': + y = y[::-1] + + df = pd.DataFrame({'x': x, 'y': y}) + expected = df.copy() + + result = getattr(df.groupby('x'), fill_method)() + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), From 236069791eadf924ec934168eb787209968ec558 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Tue, 29 May 2018 07:02:24 +0530 Subject: [PATCH 028/116] BUG: Should not raise errors in .set_names for MultiIndex with nlevels == 1 (GH21149) (#21196) (cherry picked from commit a5259cc7f1ba092dccc73e0f066d5ae6ffd5ee97) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/indexes/base.py | 3 ++- pandas/tests/indexes/test_multi.py | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 109afd05136df..ea9c951e4f4ea 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -78,6 +78,7 @@ Indexing - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) +- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) - I/O diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index df39eb5fd8312..90238af9b3632 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1384,7 +1384,8 @@ def set_names(self, names, level=None, inplace=False): names=[u'baz', u'bar']) """ - if level is not None and self.nlevels == 1: + from .multi import MultiIndex + if level is not None and not isinstance(self, MultiIndex): raise ValueError('Level must be None for non-MultiIndex') if level is not None and not is_list_like(level) and is_list_like( diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 37f70090c179f..182dbdf2cf4e4 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -164,6 +164,22 @@ def test_set_name_methods(self): assert res is None assert ind.names == new_names2 + @pytest.mark.parametrize('inplace', [True, False]) + def test_set_names_with_nlevel_1(self, inplace): + # GH 21149 + # Ensure that .set_names for MultiIndex with + # nlevels == 1 does not raise any errors + expected = pd.MultiIndex(levels=[[0, 1]], + labels=[[0, 1]], + names=['first']) + m = pd.MultiIndex.from_product([[0, 1]]) + result = m.set_names('first', level=0, inplace=inplace) + + if inplace: + result = m + + tm.assert_index_equal(result, expected) + def test_set_levels_labels_directly(self): # setting levels/labels directly raises AttributeError From 1d5e535f5aa05769d6447c639d6812e2401135f2 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Tue, 29 May 2018 11:41:27 +0100 Subject: [PATCH 029/116] BUG: set keyword argument so zipfile actually compresses (#21144) (cherry picked from commit c85ab083919b59ce84c220d5baf7d34ff4a0bcf2) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/io/common.py | 8 ++++---- pandas/tests/test_common.py | 21 ++++++++++++++++++++- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 
ea9c951e4f4ea..c6fad1158e59a 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -84,6 +84,7 @@ Indexing I/O ^^^ +- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - diff --git a/pandas/io/common.py b/pandas/io/common.py index 0827216975f15..a492b7c0b8e8e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,7 @@ import codecs import mmap from contextlib import contextmanager, closing -from zipfile import ZipFile +import zipfile from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat @@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, return f, handles -class BytesZipFile(ZipFile, BytesIO): +class BytesZipFile(zipfile.ZipFile, BytesIO): """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -437,10 +437,10 @@ class BytesZipFile(ZipFile, BytesIO): bytes strings into a member of the archive. """ # GH 17778 - def __init__(self, file, mode='r', **kwargs): + def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') - super(BytesZipFile, self).__init__(file, mode, **kwargs) + super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 0b329f64dafa3..bb7ee1b911fee 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- import pytest +import os import collections from functools import partial import numpy as np -from pandas import Series, Timestamp +from pandas import Series, DataFrame, Timestamp from pandas.compat import range, lmap import pandas.core.common as com from pandas.core import ops @@ -222,3 +223,21 @@ def test_standardize_mapping(): dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) + + +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +def test_compression_size(obj, method, compression): + if not compression: + pytest.skip("only test compression case.") + + with tm.ensure_clean() as filename: + getattr(obj, method)(filename, compression=compression) + compressed = os.path.getsize(filename) + getattr(obj, method)(filename, compression=None) + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed From ba16de01a567441b4e15f9c4189aceb83eb0afe2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 30 May 2018 14:04:47 -0700 Subject: [PATCH 030/116] BUG: Support for OO Optimization (#21093) (cherry picked from commit 4cbbcc648436ac21aed296206ace61da96aa7614) --- doc/source/whatsnew/v0.23.1.txt | 8 ++++-- pandas/tests/test_downstream.py | 7 +++++ pandas/tseries/offsets.py | 9 +++++-- pandas/util/_decorators.py | 47 +++++++++++++++++---------------- 4 files changed, 44 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index c6fad1158e59a..3a00cfae48b54 
100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -44,8 +44,6 @@ Documentation Changes Bug Fixes ~~~~~~~~~ -- tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) - Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -99,3 +97,9 @@ Reshaping - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - + +Other +^^^^^ + +- Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) +- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c28e2052bd93e..c2d09c6d49e86 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -2,6 +2,8 @@ """ Testing that we work in the downstream packages """ +import subprocess + import pytest import numpy as np # noqa from pandas import DataFrame @@ -53,6 +55,11 @@ def test_xarray(df): assert df.to_xarray() is not None +def test_oo_optimizable(): + # GH 21071 + subprocess.check_call(["python", "-OO", "-c", "import pandas"]) + + @tm.network def test_statsmodels(): diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 749165f894819..c294110d89ec5 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1090,12 +1090,17 @@ def apply(self, other): class CustomBusinessMonthEnd(_CustomBusinessMonth): - __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'end') + # TODO(py27): Replace condition with Substitution after dropping Py27 + if _CustomBusinessMonth.__doc__: + __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'end') _prefix = 'CBM' class CustomBusinessMonthBegin(_CustomBusinessMonth): - __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'beginning') + # TODO(py27): Replace condition with Substitution after dropping Py27 + if _CustomBusinessMonth.__doc__: + __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', + 'beginning') _prefix = 'CBMS' diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 624fbbbd4f05e..6b55554cdc941 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -4,7 +4,7 @@ import types import warnings from textwrap import dedent, wrap -from functools import wraps, update_wrapper +from functools import wraps, update_wrapper, WRAPPER_ASSIGNMENTS def deprecate(name, alternative, version, alt_name=None, @@ -20,18 +20,18 @@ def deprecate(name, alternative, version, alt_name=None, Parameters ---------- name : str - Name of function to deprecate - alternative : str - Name of function to use instead + Name of function to deprecate. + alternative : func + Function to use instead. version : str - Version of pandas in which the method has been deprecated + Version of pandas in which the method has been deprecated. alt_name : str, optional - Name to use in preference of alternative.__name__ + Name to use in preference of alternative.__name__. klass : Warning, default FutureWarning stacklevel : int, default 2 msg : str - The message to display in the warning. - Default is '{name} is deprecated. Use {alt_name} instead.' + The message to display in the warning. + Default is '{name} is deprecated. Use {alt_name} instead.'
""" alt_name = alt_name or alternative.__name__ @@ -39,25 +39,26 @@ def deprecate(name, alternative, version, alt_name=None, warning_msg = msg or '{} is deprecated, use {} instead'.format(name, alt_name) - @wraps(alternative) + # adding deprecated directive to the docstring + msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name) + msg = '\n '.join(wrap(msg, 70)) + + @Substitution(version=version, msg=msg) + @Appender(alternative.__doc__) def wrapper(*args, **kwargs): + """ + .. deprecated:: %(version)s + + %(msg)s + + """ warnings.warn(warning_msg, klass, stacklevel=stacklevel) return alternative(*args, **kwargs) - # adding deprecated directive to the docstring - msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name) - tpl = dedent(""" - .. deprecated:: {version} - - {msg} - - {rest} - """) - rest = getattr(wrapper, '__doc__', '') - docstring = tpl.format(version=version, - msg='\n '.join(wrap(msg, 70)), - rest=dedent(rest)) - wrapper.__doc__ = docstring + # Since we are using Substitution to create the required docstring, + # remove that from the attributes that should be assigned to the wrapper + assignments = tuple(x for x in WRAPPER_ASSIGNMENTS if x != '__doc__') + update_wrapper(wrapper, alternative, assigned=assignments) return wrapper From 5dfa9fddd772218c0fa1a332e84ee3a4f87ef678 Mon Sep 17 00:00:00 2001 From: Gabe Fernando Date: Thu, 31 May 2018 06:19:18 -0400 Subject: [PATCH 031/116] DOC: fill in class names for rename methods (#21268) (cherry picked from commit 7647969df247727afdcbcfa52169f4436d6ef377) --- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d29df6cc5f475..9f6e834f0a25f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3721,7 +3721,7 @@ def rename(self, *args, **kwargs): copy : boolean, default True Also copy underlying data inplace : boolean, default False - Whether to return a new %(klass)s. If True then value of copy is + Whether to return a new DataFrame. If True then value of copy is ignored. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified diff --git a/pandas/core/series.py b/pandas/core/series.py index 3d158f1aa9aad..c5caafa07fb8e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3269,7 +3269,7 @@ def rename(self, index=None, **kwargs): copy : boolean, default True Also copy underlying data inplace : boolean, default False - Whether to return a new %(klass)s. If True then value of copy is + Whether to return a new Series. If True then value of copy is ignored. 
level : int or level name, default None In case of a MultiIndex, only rename labels in the specified From ea737c7be0993f3a52ad79d7e92614cb9428c0b3 Mon Sep 17 00:00:00 2001 From: nprad Date: Thu, 31 May 2018 05:27:32 -0500 Subject: [PATCH 032/116] BUG: Fix inconsistency between the shape properties of SparseSeries and SparseArray (#21126) (#21198) (cherry picked from commit 5348e06c4e9e8a03cbd0011483d2dd087e850940) --- doc/source/whatsnew/v0.23.1.txt | 5 +++++ pandas/core/sparse/array.py | 5 +++++ pandas/tests/sparse/test_array.py | 11 +++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 3a00cfae48b54..05f41b1234eaf 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -65,6 +65,11 @@ Categorical - Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +Sparse +^^^^^^ + +- Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`) + Conversion ^^^^^^^^^^ diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 5532d7522cd2d..ff58f7d104ff9 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -290,6 +290,7 @@ def __reduce__(self): """Necessary for making this object picklable""" object_state = list(np.ndarray.__reduce__(self)) subclass_state = self.fill_value, self.sp_index + object_state[2] = self.sp_values.__reduce__()[2] object_state[2] = (object_state[2], subclass_state) return tuple(object_state) @@ -339,6 +340,10 @@ def values(self): output.put(int_index.indices, self) return output + @property + def shape(self): + return (len(self),) + @property def sp_values(self): # caching not an option, leaks memory diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 6c0c83cf65ff7..b3330f866ba1f 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -454,6 +454,17 @@ def test_values_asarray(self): assert_almost_equal(self.arr.to_dense(), self.arr_data) assert_almost_equal(self.arr.sp_values, np.asarray(self.arr)) + @pytest.mark.parametrize('data,shape,dtype', [ + ([0, 0, 0, 0, 0], (5,), None), + ([], (0,), None), + ([0], (1,), None), + (['A', 'A', np.nan, 'B'], (4,), np.object) + ]) + def test_shape(self, data, shape, dtype): + # GH 21126 + out = SparseArray(data, dtype=dtype) + assert out.shape == shape + def test_to_dense(self): vals = np.array([1, np.nan, np.nan, 3, np.nan]) res = SparseArray(vals).to_dense() From 110cf9586d08b217252dd59f84b52bd22a8459c1 Mon Sep 17 00:00:00 2001 From: Wenhuan Date: Fri, 1 Jun 2018 04:44:52 +0800 Subject: [PATCH 033/116] BUG: make dense ranks results scale to 100 percent (#21203) (cherry picked from commit b237b11ba9f7e0465642fd0286b2a483289eaad0) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/_libs/groupby_helper.pxi.in | 18 ++++++++++++------ pandas/tests/groupby/test_rank.py | 14 +++++++------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 05f41b1234eaf..b3c1dbc86525d 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ 
-49,6 +49,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) +- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` Strings ^^^^^^^ diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 6a33e4a09476d..b3e9b7c9e69ee 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): """ - Provides the rank of values within each group. + Provides the rank of values within each group. Parameters ---------- @@ -451,8 +451,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0 + Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 ndarray[int64_t] _as ndarray[float64_t, ndim=2] grp_sizes ndarray[{{c_type}}] masked_vals @@ -563,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 + grp_tie_count +=1 # Similar to the previous conditional, check now if we are moving # to a new group. If so, keep track of the index where the new @@ -571,11 +572,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # (used by pct calculations later). also be sure to reset any of # the items helping to calculate dups if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - val_start = i + 1 + grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 6ad8b4905abff..203c3c73bec94 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -59,9 +59,9 @@ def test_rank_apply(): ('first', False, False, [3., 4., 1., 5., 2.]), ('first', False, True, [.6, .8, .2, 1., .4]), ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]), ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [.6, .6, .2, .6, .4]), + ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. 
/ 3.]), ]) def test_rank_args(grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) @@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): @pytest.mark.parametrize("grps", [ ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-06'), np.nan, np.nan] @@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): ('dense', True, 'keep', False, [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), ('dense', True, 'keep', True, - [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]), ('dense', False, 'keep', False, [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), ('dense', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]), ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), ('average', True, 'no_na', True, [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), @@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), ('dense', True, 'no_na', True, - [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]), ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), ('dense', False, 'no_na', True, - [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) + [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.]) ]) def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): From 2a18a97b3078fa65b5be67434d1fb182066844c3 Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Thu, 31 May 2018 19:14:33 -0500 Subject: [PATCH 034/116] Add missing period to get_dummies docs (#21277) (cherry picked from commit 4274b840e64374a39a0285c2174968588753ec35) --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0829aa8f5a509..2757e0797a410 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -725,7 +725,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None - String to append DataFrame column names + String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. 
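For reference, a hypothetical usage sketch of the `prefix` variants the amended get_dummies docstring above describes (frame and prefix names invented):

    import pandas as pd

    df = pd.DataFrame({'col1': ['a', 'b'], 'col2': ['c', 'd']})
    # a list supplies one prefix per encoded column
    print(pd.get_dummies(df, prefix=['p1', 'p2']).columns.tolist())
    # ['p1_a', 'p1_b', 'p2_c', 'p2_d']
    # a dict maps column names to prefixes
    print(pd.get_dummies(df, prefix={'col1': 'x', 'col2': 'y'}).columns.tolist())
    # ['x_a', 'x_b', 'y_c', 'y_d']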
From 3e3e35ff084d412d158eb7de06febcd5e9b44c4b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Jun 2018 11:53:06 +0200 Subject: [PATCH 035/116] CI: revert skip of geopandas downstream test (#21217) (cherry picked from commit 88c3f08d9b031f6559b9db6574ec02da5f81f6a8) --- pandas/tests/test_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c2d09c6d49e86..afd7993fefc70 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -103,7 +103,6 @@ def test_pandas_datareader(): 'F', 'quandl', '2017-01-01', '2017-02-01') -@pytest.mark.xfail(reason="downstream install issue") def test_geopandas(): geopandas = import_module('geopandas') # noqa From 28ba35a3a06eda564358dd615430baf2f7123ca5 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 4 Jun 2018 22:43:16 +0100 Subject: [PATCH 036/116] Improve performance of CategoricalIndex.is_unique (#21107) (cherry picked from commit 9f95f7dbffef7752175ca9ed918314cb6f0b9b18) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/indexes/category.py | 2 +- pandas/tests/indexes/test_category.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index b3c1dbc86525d..64a98de9c2bf7 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -30,6 +30,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) +- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`) - - diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 78b7ae7054248..150eca32e229d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -378,7 +378,7 @@ def _engine(self): # introspection @cache_readonly def is_unique(self): - return not self.duplicated().any() + return self._engine.is_unique @property def is_monotonic_increasing(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 0e630f69b1a32..a2a4170256088 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -581,6 +581,15 @@ def test_is_monotonic(self, data, non_lexsorted_data): assert c.is_monotonic_increasing assert not c.is_monotonic_decreasing + @pytest.mark.parametrize('values, expected', [ + ([1, 2, 3], True), + ([1, 3, 1], False), + (list('abc'), True), + (list('aba'), False)]) + def test_is_unique(self, values, expected): + ci = CategoricalIndex(values) + assert ci.is_unique is expected + def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') From 0a6fc083d210a9771a5366620b973768a6954fa4 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 5 Jun 2018 02:04:02 -0700 Subject: [PATCH 037/116] DOC: whatsnew note for MultiIndex Sorting Fix (#21316) (cherry picked from commit 15b39cdb2ee521964a00308f09d45f92be2feaf5) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 64a98de9c2bf7..1d7ef963d1153 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -84,6 +84,8 @@ Indexing - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) -
Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) +- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) +- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) - I/O From 9370d560bd3761928af523c68dfc864ef9a2f418 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Wed, 6 Jun 2018 17:08:22 +0200 Subject: [PATCH 038/116] DOC: fix mistake in Series.str.cat (#21330) (cherry picked from commit 0c65c57a279e755ab7093db925d1e580f9878dae) --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5d50c45fe7eca..44811781837bc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2172,9 +2172,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Returns ------- - concat : str if `other is None`, Series/Index of objects if `others is - not None`. In the latter case, the result will remain categorical - if the calling Series/Index is categorical. + concat : str or Series/Index of objects + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. See Also -------- From 55060215aabdf3e57d751944bdfa03356b88b26b Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Thu, 7 Jun 2018 05:39:24 -0400 Subject: [PATCH 039/116] BUG: Using DatetimeIndex.date with timezone returns incorrect date (#21281) * BUG: Using DatetimeIndex.date with timezone returns incorrect date #21230 * Fix bug where DTI.time returns a tz-aware Time instead of tz-naive #21267 (cherry picked from commit a363e1a920d93d41bc87cb70afe35d030cc6bf9a) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ pandas/_libs/tslib.pyx | 2 +- pandas/core/indexes/datetimes.py | 22 +++++++++++++-- .../tests/indexes/datetimes/test_timezones.py | 28 ++++++++++++++++++- 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 1d7ef963d1153..6b7ca4ca0ca7e 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -84,8 +84,10 @@ Indexing - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) +- Bug in :attr:`DatetimeIndex.date` where an incorrect date is returned when the input date has a non-UTC timezone (:issue:`21230`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) +- Bug in :attr:`DatetimeIndex.time` 
where given a tz-aware Timestamp, a tz-aware Time is returned instead of tz-naive (:issue:`21267`) - I/O diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 17453d8af1297..0f58cfa761f21 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -77,7 +77,7 @@ cdef inline object create_time_from_ts( int64_t value, pandas_datetimestruct dts, object tz, object freq): """ convenience routine to construct a datetime.time from its parts """ - return time(dts.hour, dts.min, dts.sec, dts.us, tz) + return time(dts.hour, dts.min, dts.sec, dts.us) def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 83950f1d71633..0ddf33cdcae73 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -2032,7 +2032,16 @@ def time(self): """ Returns numpy array of datetime.time. The time part of the Timestamps. """ + + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if (self.tz is not None and self.tz is not utc): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + - return libts.ints_to_pydatetime(self.asi8, self.tz, box="time") + return libts.ints_to_pydatetime(timestamps, box="time") @property def date(self): @@ -2040,7 +2049,16 @@ def date(self): Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). """ + + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if (self.tz is not None and self.tz is not utc): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + - return libts.ints_to_pydatetime(self.normalize().asi8, box="date") + return libts.ints_to_pydatetime(timestamps, box="date") def normalize(self): """ diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 09210d8b64d1b..573940edaa08f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -2,7 +2,7 @@ """ Tests for DatetimeIndex timezone-related methods """ -from datetime import datetime, timedelta, tzinfo +from datetime import datetime, timedelta, tzinfo, date, time from distutils.version import LooseVersion import pytest @@ -706,6 +706,32 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz.zone == 'UTC' + @pytest.mark.parametrize("dtype", [ + None, 'datetime64[ns, CET]', + 'datetime64[ns, EST]', 'datetime64[ns, UTC]' + ]) + def test_date_accessor(self, dtype): + # Regression test for GH#21230 + expected = np.array([date(2018, 6, 4), pd.NaT]) + + index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype) + result = index.date + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", [ + None, 'datetime64[ns, CET]', + 'datetime64[ns, EST]', 'datetime64[ns, UTC]' + ]) + def test_time_accessor(self, dtype): + # Regression test for GH#21267 + expected = np.array([time(10, 20, 30), pd.NaT]) + + index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype) + result = index.time + + tm.assert_numpy_array_equal(result, expected) + def test_dti_drop_dont_lose_tz(self): # GH#2621 ind = date_range("2012-12-01", periods=10, tz="utc")
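A minimal sketch of the corrected accessors (index values mirror the new regression tests; not part of the patch):

    import pandas as pd

    idx = pd.DatetimeIndex(['2018-06-04 10:20:30'], tz='CET')
    # .date now reflects the local (CET) calendar day rather than UTC,
    # and .time returns tz-naive datetime.time objects
    print(idx.date)  # [datetime.date(2018, 6, 4)]
    print(idx.time)  # [datetime.time(10, 20, 30)]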
From c663deee750ef611bcc079077e9776c82746bad9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 16:21:08 -0500 Subject: [PATCH 040/116] BUG: Fixed concat warning message (#21362) (cherry picked from commit 649bfae90f70e8ee7181aba31b0f0b44f09b76e6) --- doc/source/whatsnew/v0.23.1.txt | 2 +- pandas/core/indexes/api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 6b7ca4ca0ca7e..cb44bec9ed092 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -107,7 +107,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) -- +- Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) Other ^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f9501cd2f9ddf..6f4fdfe5bf5cd 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -24,9 +24,9 @@ Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. -To accept the future behavior, pass 'sort=True'. +To accept the future behavior, pass 'sort=False'. -To retain the current behavior and silence the warning, pass sort=False +To retain the current behavior and silence the warning, pass 'sort=True'. """) From 1391fba7aeb10e84e4d5df8327ecccc085194850 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 23:25:37 +0200 Subject: [PATCH 041/116] Revert "enable multivalues insert (#19664)" (#21355) This reverts commit 7c7bd569ce8e0f117c618d068e3d2798134dbc73. (cherry picked from commit c460710f32193c65e33d366921f9eaf919bc8da4) --- doc/source/io.rst | 8 -------- doc/source/whatsnew/v0.23.1.txt | 29 ++++++++++++++++------------- pandas/io/sql.py | 28 +++------------------------- pandas/tests/io/test_sql.py | 26 -------------------------- 4 files changed, 19 insertions(+), 72 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index aa2484b0cb5c3..d818f486ad62d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4719,14 +4719,6 @@ writes ``data`` to the database in batches of 1000 rows at a time: data.to_sql('data_chunked', engine, chunksize=1000) -.. note:: - - The function :func:`~pandas.DataFrame.to_sql` will perform a multivalue - insert if the engine dialect ``supports_multivalues_insert``. This will - greatly speed up the insert in some cases. - -SQL data types -++++++++++++++ :func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate SQL data type based on the dtype of the data. When you have columns of dtype diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index cb44bec9ed092..c5334338176aa 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -10,19 +10,22 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none -.. _whatsnew_0231.enhancements: - -New features -~~~~~~~~~~~~ - - -.. _whatsnew_0231.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- -- +.. _whatsnew_0231.fixed_regressions: + +Fixed Regressions + +- Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue + inserts as this caused regression in certain cases (:issue:`21103`). + In the future this will be made configurable.
+- Fixed regression in the :attr:`DatetimeIndex.date` and :attr:`DatetimeIndex.time` + attributes in case of timezone-aware data: :attr:`DatetimeIndex.time` returned + a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` + returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). +- Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values + in nested levels in JSON (:issue:`21158`). +- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) +- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) .. _whatsnew_0231.performance: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ccb8d2d99d734..a582d32741ae9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -572,29 +572,8 @@ def create(self): else: self._execute_create() - def insert_statement(self, data, conn): - """ - Generate tuple of SQLAlchemy insert statement and any arguments - to be executed by connection (via `_execute_insert`). - - Parameters - ---------- - conn : SQLAlchemy connectable(engine/connection) - Connection to recieve the data - data : list of dict - The data to be inserted - - Returns - ------- - SQLAlchemy statement - insert statement - *, optional - Additional parameters to be passed when executing insert statement - """ - dialect = getattr(conn, 'dialect', None) - if dialect and getattr(dialect, 'supports_multivalues_insert', False): - return self.table.insert(data), - return self.table.insert(), data + def insert_statement(self): + return self.table.insert() def insert_data(self): if self.index is not None: @@ -633,9 +612,8 @@ def insert_data(self): return column_names, data_list def _execute_insert(self, conn, keys, data_iter): - """Insert data into this table with database connection""" data = [{k: v for k, v in zip(keys, row)} for row in data_iter] - conn.execute(*self.insert_statement(data, conn)) + conn.execute(self.insert_statement(), data) def insert(self, chunksize=None): keys, data_list = self.insert_data() diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4530cc9d2fba9..f3ab74d37a2bc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1665,29 +1665,6 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) - def test_insert_multivalues(self): - # issues addressed - # https://github.com/pandas-dev/pandas/issues/14315 - # https://github.com/pandas-dev/pandas/issues/8953 - - db = sql.SQLDatabase(self.conn) - df = DataFrame({'A': [1, 0, 0], 'B': [1.1, 0.2, 4.3]}) - table = sql.SQLTable("test_table", db, frame=df) - data = [ - {'A': 1, 'B': 0.46}, - {'A': 0, 'B': -2.06} - ] - statement = table.insert_statement(data, conn=self.conn)[0] - - if self.supports_multivalues_insert: - assert statement.parameters == data, ( - 'insert statement should be multivalues' - ) - else: - assert statement.parameters is None, ( - 'insert statement should not be multivalues' - ) - class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): @@ -1702,7 +1679,6 @@ class _TestSQLiteAlchemy(object): """ flavor = 'sqlite' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1751,7 +1727,6 @@ class _TestMySQLAlchemy(object): """ flavor = 'mysql' - 
supports_multivalues_insert = True @classmethod def connect(cls): @@ -1821,7 +1796,6 @@ class _TestPostgreSQLAlchemy(object): """ flavor = 'postgresql' - supports_multivalues_insert = True @classmethod def connect(cls): From 25a18b7a12f7a3f329b33d9ef041b07949db7d00 Mon Sep 17 00:00:00 2001 From: Uddeshya Singh Date: Fri, 8 Jun 2018 21:55:51 +0530 Subject: [PATCH 042/116] BUG: invalid rolling window on empty input (#21291) (cherry picked from commit 93be27d6c5354f2a1daa10ac9cbe8f78934ea455) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/window.py | 4 ++-- pandas/tests/test_window.py | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index c5334338176aa..f41dd61d392ae 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -54,6 +54,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` +- Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) Strings ^^^^^^^ diff --git a/pandas/core/window.py b/pandas/core/window.py index 015e7f7913ed0..9d0f9dc4f75f9 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -602,8 +602,8 @@ def validate(self): if isinstance(window, (list, tuple, np.ndarray)): pass elif is_integer(window): - if window < 0: - raise ValueError("window must be non-negative") + if window <= 0: + raise ValueError("window must be > 0 ") try: import scipy.signal as sig except ImportError: diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 74f2c977e0db2..cfd88f41f855e 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -389,8 +389,8 @@ def test_constructor(self, which): c(window=2, min_periods=1, center=False) # GH 13383 - c(0) with pytest.raises(ValueError): + c(0) c(-1) # not valid @@ -409,7 +409,6 @@ def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling - c(0, win_type='boxcar') with pytest.raises(ValueError): c(-1, win_type='boxcar') From 53f2d9f27a279b5113cc7829c451b8b67847c751 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Jun 2018 23:20:16 +0200 Subject: [PATCH 043/116] DOC: clean-up 0.23.1 whatsnew (#21368) (cherry picked from commit 5bbbaf6ae48681699cfbdf8f4a726661118e0dcb) --- doc/source/whatsnew/v0.23.1.txt | 45 +++------------------------------ 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index f41dd61d392ae..12608f677d22c 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -13,6 +13,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. _whatsnew_0231.fixed_regressions: Fixed Regressions +~~~~~~~~~~~~~~~~~ - Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue inserts as this caused regression in certain cases (:issue:`21103`). 
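As an illustration of the stricter validation added in PATCH 042 above (a minimal editor's sketch, not part of the patch series):

.. code-block:: python

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0])

    # 0.23.0 silently accepted a zero-sized window; after this fix the
    # rolling constructor raises immediately instead
    try:
        s.rolling(window=0)
    except ValueError as err:
        print(err)  # "window must be > 0 "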
@@ -34,14 +35,7 @@ Performance Improvements - Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) - Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`) -- -- -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- -- .. _whatsnew_0231.bug_fixes: @@ -49,72 +43,41 @@ Bug Fixes ~~~~~~~~~ Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` - Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) -Strings -^^^^^^^ +Data-type specific - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - -Timedelta -^^^^^^^^^ - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - -Categorical -^^^^^^^^^^^ - -- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) -- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) Sparse -^^^^^^ - Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`) -Conversion -^^^^^^^^^^ - -- -- - Indexing -^^^^^^^^ - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) -- Bug in :attr:`DatetimeIndex.date` where an incorrect date is returned when the input date has a non-UTC timezone (:issue:`21230`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) -- Bug in :attr:`DatetimeIndex.time` where given a tz-aware Timestamp, a tz-aware Time is returned instead of tz-naive (:issue:`21267`) -- I/O -^^^ - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented 
exporting DataFrames to buffers and most file-like objects (:issue:`21041`) -- - -Plotting -^^^^^^^^ - -- -- +- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) Reshaping -^^^^^^^^^ - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) Other -^^^^^ - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) -- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) From 88260eaa99a08c23f1baac60f29b7889f4a7018d Mon Sep 17 00:00:00 2001 From: Damini Satya Date: Fri, 8 Jun 2018 09:50:20 -0700 Subject: [PATCH 044/116] Fix #21356: JSON nested_to_record Silently Drops Top-Level None Values (#21363) (cherry picked from commit ff2663247c2445677f27f3f46fe14f3ef265ce2d) --- doc/source/whatsnew/v0.23.1.txt | 5 ++ pandas/io/json/normalize.py | 2 - pandas/tests/io/json/test_normalize.py | 75 +++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 12608f677d22c..020eebd414ac7 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -27,6 +27,11 @@ Fixed Regressions - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) +- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing + values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) +- Fixed regression in :func:`nested_to_record` which now flattens lists of dictionaries and does not drop keys whose value is `None` (:issue:`21356`) + ..
_whatsnew_0231.performance: diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 549204abd3caf..b845a43b9ca9e 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -80,8 +80,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v - if v is None: # pop the key if the value is None - new_d.pop(k) continue else: v = new_d.pop(k) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 0fabaf747b6de..395c2c90767d3 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -238,15 +238,16 @@ def test_non_ascii_key(self): tm.assert_frame_equal(result, expected) def test_missing_field(self, author_missing_data): - # GH20030: Checks for robustness of json_normalize - should - # unnest records where only the first record has a None value + # GH20030: result = json_normalize(author_missing_data) ex_data = [ - {'author_name.first': np.nan, + {'info': np.nan, + 'author_name.first': np.nan, 'author_name.last_name': np.nan, 'info.created_at': np.nan, 'info.last_updated': np.nan}, - {'author_name.first': 'Jane', + {'info': None, + 'author_name.first': 'Jane', 'author_name.last_name': 'Doe', 'info.created_at': '11/08/1993', 'info.last_updated': '26/05/2012'} @@ -351,9 +352,8 @@ def test_json_normalize_errors(self): errors='raise' ) - def test_nonetype_dropping(self): - # GH20030: Checks that None values are dropped in nested_to_record - # to prevent additional columns of nans when passed to DataFrame + def test_donot_drop_nonevalues(self): + # GH21356 data = [ {'info': None, 'author_name': {'first': 'Smith', 'last_name': 'Appleseed'}}, {'author_name': {'first': 'Jane', 'last_name': 'Doe', 'info': {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}} ] result = nested_to_record(data) expected = [ - {'author_name.first': 'Smith', + {'info': None, + 'author_name.first': 'Smith', 'author_name.last_name': 'Appleseed'}, {'author_name.first': 'Jane', 'author_name.last_name': 'Doe', 'info.created_at': '11/08/1993', 'info.last_updated': '26/05/2012'}] assert result == expected + + def test_nonetype_top_level_bottom_level(self): + # GH21158: If inner level json has a key with a null value + # make sure it doesn't call new_d.pop twice and raise an exception + data = { + "id": None, + "location": { + "country": { + "state": { + "id": None, + "town.info": { + "id": None, + "region": None, + "x": 49.151580810546875, + "y": -33.148521423339844, + "z": 27.572303771972656}}} + } + } + result = nested_to_record(data) + expected = { + 'id': None, + 'location.country.state.id': None, + 'location.country.state.town.info.id': None, + 'location.country.state.town.info.region': None, + 'location.country.state.town.info.x': 49.151580810546875, + 'location.country.state.town.info.y': -33.148521423339844, + 'location.country.state.town.info.z': 27.572303771972656} + assert result == expected + + def test_nonetype_multiple_levels(self): + # GH21158: If inner level json has a key with a null value + # make sure it doesn't call new_d.pop twice and raise an exception + data = { + "id": None, + "location": { + "id": None, + "country": { + "id": None, + "state": { + "id": None, + "town.info": { + "region": None, + "x": 49.151580810546875, + "y": -33.148521423339844, + "z": 27.572303771972656}}} + } + } + result = nested_to_record(data) + expected = { + 'id': None, + 'location.id': None, + 'location.country.id': None, + 'location.country.state.id': None, + 'location.country.state.town.info.region': None, + 'location.country.state.town.info.x': 49.151580810546875, + 'location.country.state.town.info.y': -33.148521423339844, + 'location.country.state.town.info.z': 27.572303771972656} + assert result == expected
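As an illustration of the fix in PATCH 044 above (a minimal editor's sketch, not part of the patch series, mirroring ``test_donot_drop_nonevalues``; ``nested_to_record`` is a private helper, and its import path is taken from the diff):

.. code-block:: python

    from pandas.io.json.normalize import nested_to_record

    data = [{'info': None,
             'author_name': {'first': 'Smith', 'last_name': 'Appleseed'}}]

    # the top-level 'info' key is now kept with its None value
    # instead of being silently dropped
    nested_to_record(data)
    # [{'info': None, 'author_name.first': 'Smith',
    #   'author_name.last_name': 'Appleseed'}]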
From a75ec8e73a252699957e4f4b7f27724e7af349fe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 8 Jun 2018 11:54:36 -0500 Subject: [PATCH 045/116] Revert change to comparison op with datetime.date objects (#21361) (cherry picked from commit d79203af0552e73933e6f80f4284ac2697372eaa) --- doc/source/whatsnew/v0.23.1.txt | 42 ++++++++++++++++++++++++++ pandas/core/ops.py | 30 ++++++++++++++++++ pandas/tests/series/test_arithmetic.py | 40 ++++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 020eebd414ac7..80526358f1d3d 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -15,6 +15,48 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ +**Comparing Series with datetime.date** + +We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`). +In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing. +This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal. + +In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning. + +We've temporarily restored the 0.22.0 behavior, so datetimes and dates may again compare equal, but will restore the 0.23.0 behavior in a future release. + +To summarize, here's the behavior in 0.22.0, 0.23.0, 0.23.1: + +.. code-block:: python + + # 0.22.0... Silently coerce the datetime.date + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + 0 True + 1 False + dtype: bool + + # 0.23.0... Do not coerce the datetime.date + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + 0 False + 1 False + dtype: bool + + # 0.23.1... Coerce the datetime.date with a warning + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + /bin/python:1: FutureWarning: Comparing Series of datetimes with 'datetime.date'. Currently, the + 'datetime.date' is coerced to a datetime. In the future pandas will + not coerce, and the values not compare equal to the 'datetime.date'. + To retain the current behavior, convert the 'datetime.date' to a + datetime with 'pd.Timestamp'. + #!/bin/python3 + 0 True + 1 False + dtype: bool + +In addition, ordering comparisons will raise a ``TypeError`` in the future. + +**Other Fixes** + - Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue inserts as this caused regression in certain cases (:issue:`21103`). In the future this will be made configurable.
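As an illustration (an editor's sketch, not part of the patch series) of the workaround the warning below recommends:

.. code-block:: python

    import datetime
    import pandas as pd

    ser = pd.Series(pd.date_range('2017', periods=2))

    # converting the date to a Timestamp up front avoids the
    # FutureWarning and is stable across the planned change
    ser == pd.Timestamp(datetime.date(2017, 1, 1))
    # 0     True
    # 1    False
    # dtype: bool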
diff --git a/pandas/core/ops.py b/pandas/core/ops.py index e14f82906cd06..540ebeee438f6 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -5,7 +5,10 @@ """ # necessary to enforce truediv in Python 2.X from __future__ import division +import datetime import operator +import textwrap +import warnings import numpy as np import pandas as pd @@ -1197,8 +1200,35 @@ def wrapper(self, other, axis=None): if is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical # Series/Index behavior + if (isinstance(other, datetime.date) and + not isinstance(other, datetime.datetime)): + # https://github.com/pandas-dev/pandas/issues/21152 + # Compatibility for difference between Series comparison w/ + # datetime and date + msg = ( + "Comparing Series of datetimes with 'datetime.date'. " + "Currently, the 'datetime.date' is coerced to a " + "datetime. In the future pandas will not coerce, " + "and {future}. " + "To retain the current behavior, " + "convert the 'datetime.date' to a datetime with " + "'pd.Timestamp'." + ) + + if op in {operator.lt, operator.le, operator.gt, operator.ge}: + future = "a TypeError will be raised" + else: + future = ( + "'the values will not compare equal to the " + "'datetime.date'" + ) + msg = '\n'.join(textwrap.wrap(msg.format(future=future))) + warnings.warn(msg, FutureWarning, stacklevel=2) + other = pd.Timestamp(other) + res_values = dispatch_to_index_op(op, self, other, pd.DatetimeIndex) + return self._constructor(res_values, index=self.index, name=res_name) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index ec0d7296e540e..95836f046195a 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -88,6 +88,46 @@ def test_ser_cmp_result_names(self, names, op): class TestTimestampSeriesComparison(object): + def test_dt64_ser_cmp_date_warning(self): + # https://github.com/pandas-dev/pandas/issues/21359 + # Remove this test and enable invalid test below + ser = pd.Series(pd.date_range('20010101', periods=10), name='dates') + date = ser.iloc[0].to_pydatetime().date() + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser == date + expected = pd.Series([True] + [False] * 9, name='dates') + tm.assert_series_equal(result, expected) + assert "Comparing Series of datetimes " in str(m[0].message) + assert "will not compare equal" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser != date + tm.assert_series_equal(result, ~expected) + assert "will not compare equal" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser <= date + tm.assert_series_equal(result, expected) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser < date + tm.assert_series_equal(result, pd.Series([False] * 10, name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser >= date + tm.assert_series_equal(result, pd.Series([True] * 10, name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser > date + tm.assert_series_equal(result, pd.Series([False] + [True] * 9, + name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + @pytest.mark.skip(reason="GH-21359") def test_dt64ser_cmp_date_invalid(self): # GH#19800 datetime.date comparison raises to # match DatetimeIndex/Timestamp. This also matches the behavior
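As an illustration (an editor's sketch, not part of the patch series) of the behavior that PATCH 046 below reinstates, mirroring its new tests:

.. code-block:: python

    import numpy as np
    import pandas as pd

    # NA values are preserved instead of being coerced to the
    # strings 'None' / 'nan'
    ser = pd.Series(['x', None], dtype=str)
    ser.isna().tolist()  # [False, True]

    ser = pd.Series([1.0, 2.0, np.nan], dtype=str)
    np.isnan(ser.iloc[2])  # True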
From e841daa161f541b7caa451838cff08bc91f4c9f2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 8 Jun 2018 11:27:13 -0500 Subject: [PATCH 046/116] REGR: NA-values in ctors with string dtype (#21366) (cherry picked from commit 636dd01fdacba0c8f0e7b5aaa726165983fc861d) --- pandas/conftest.py | 11 +++++++ pandas/core/dtypes/cast.py | 42 ++++++++++++++++++++++++ pandas/core/series.py | 4 ++- pandas/tests/dtypes/test_cast.py | 13 ++++++++ pandas/tests/frame/test_constructors.py | 11 +++++++ pandas/tests/frame/test_dtypes.py | 16 +++++---- pandas/tests/series/test_constructors.py | 26 +++++++++++---- 7 files changed, 110 insertions(+), 13 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b09cb872a12fb..e6c1b1b171045 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -149,3 +149,14 @@ def tz_aware_fixture(request): Fixture for trying explicit timezones: {0} """ return request.param + + +@pytest.fixture(params=[str, 'str', 'U']) +def string_dtype(request): + """Parametrized fixture for string dtypes. + + * str + * 'str' + * 'U' + """ + return request.param diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e4ed6d544d42e..ebc7a13234a98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values): result = np.empty(len(values), dtype='object') result[:] = values return result + + +def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): + """ + Construct a new ndarray, coercing `values` to `dtype`, preserving NA. + + Parameters + ---------- + values : Sequence + dtype : numpy.dtype, optional + copy : bool, default False + Note that copies may still be made with ``copy=False`` if casting + is required. + + Returns + ------- + arr : ndarray[dtype] + + Examples + -------- + >>> np.array([1.0, 2.0, None], dtype='str') + array(['1.0', '2.0', 'None'], dtype='<U4') + + >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str') + array(['1.0', '2.0', None], dtype=object) + + """ + subarr = np.array(values, dtype=dtype, copy=copy) + + if dtype is not None and dtype.kind in ("U", "S"): + # GH-21083 + # We can't just return np.array(subarr, dtype='str') since + # NumPy will convert the non-string objects into strings + # Including NA values. So we have to go + # string -> object -> update NA, which requires an + # additional pass over the data.
+ na_values = isna(values) + subarr2 = subarr.astype(object) + subarr2[na_values] = np.asarray(values, dtype=object)[na_values] + subarr = subarr2 + + return subarr diff --git a/pandas/core/series.py b/pandas/core/series.py index c5caafa07fb8e..6975dd8fc918e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -40,6 +40,7 @@ maybe_convert_platform, maybe_cast_to_datetime, maybe_castable, construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, construct_1d_object_array_from_listlike) from pandas.core.dtypes.missing import ( isna, @@ -4047,7 +4048,8 @@ def _try_cast(arr, take_fast_path): isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = np.array(subarr, dtype=dtype, copy=copy) + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, + copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 20cd8b43478d2..4a19682e2c558 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -23,6 +23,7 @@ maybe_convert_scalar, find_common_type, construct_1d_object_array_from_listlike, + construct_1d_ndarray_preserving_na, construct_1d_arraylike_from_scalar) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self): tm.assert_categorical_equal(result, expected, check_category_order=True, check_dtype=True) + + +@pytest.mark.parametrize('values, dtype, expected', [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (['1', '2', None], None, np.array(['1', '2', None])), + (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])), + ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])), +]) +def test_construct_1d_ndarray_preserving_na(values, dtype, expected): + result = construct_1d_ndarray_preserving_na(values, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6dd38187f7277..70dd358248bc4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self): assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + df = DataFrame({'A': ['x', None]}, dtype=string_dtype) + result = df.isna() + expected = DataFrame({"A": [False, True]}) + tm.assert_frame_equal(result, expected) + assert df.iloc[1, 0] is None + + df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) + assert np.isnan(df.iloc[1, 0]) + def test_constructor_rec(self): rec = self.frame.to_records(index=False) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 4c9f8c2ea0980..1eeeec0be3b8b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def 
test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' - for dtype in ['str', str, 'U']: - result = DataFrame({'A': input_vals}, dtype=dtype) - expected = DataFrame({'A': input_vals}).astype({'A': dtype}) - assert_frame_equal(result, expected) + result = DataFrame({'A': input_vals}, dtype=string_dtype) + expected = DataFrame({'A': input_vals}).astype({'A': string_dtype}) + assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) + assert_frame_equal(result, expected) class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7e59325c32ddc..906d2aacd5586 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + ser = Series(['x', None], dtype=string_dtype) + result = ser.isna() + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + assert ser.iloc[1] is None + + ser = Series(['x', np.nan], dtype=string_dtype) + assert np.isnan(ser.iloc[1]) + def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) @@ -164,22 +175,25 @@ def test_constructor_list_like(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' + result = Series(input_vals, dtype=string_dtype) + expected = Series(input_vals).astype(string_dtype) + assert_series_equal(result, expected) - for dtype in ['str', str, 'U']: - result = Series(input_vals, dtype=dtype) - expected = Series(input_vals).astype(dtype) - assert_series_equal(result, expected) + def test_constructor_list_str_na(self, string_dtype): + result = Series([1.0, 2.0, np.nan], dtype=string_dtype) + expected = Series(['1.0', '2.0', np.nan], dtype=object) + assert_series_equal(result, expected) + assert np.isnan(result[2]) def test_constructor_generator(self): gen = (i for i in range(10)) From 4807bce9477aa4a3140b89252138dbee2551444f Mon Sep 17 00:00:00 2001 From: Pyry Kovanen Date: Sat, 9 Jun 2018 02:40:03 +0300 Subject: [PATCH 047/116] BUG: Fix empty Data frames to JSON round-trippable back to data frames (#21318) (cherry picked from commit 415012f4f38ca0cf41717c51e49bd2349cba09a8) --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/io/json/table_schema.py | 2 +- pandas/tests/io/json/test_json_table_schema.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 80526358f1d3d..3bbacd909c603 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -119,6 +119,7 @@ I/O - Bug in IO methods 
specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) +- Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) Reshaping diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 01f7db7d68664..5cea64388bdd7 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -296,7 +296,7 @@ def parse_table_schema(json, precise_float): """ table = loads(json, precise_float=precise_float) col_order = [field['name'] for field in table['schema']['fields']] - df = DataFrame(table['data'])[col_order] + df = DataFrame(table['data'], columns=col_order)[col_order] dtypes = {field['name']: convert_json_field_to_pandas_type(field) for field in table['schema']['fields']} diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 49b39c17238ae..b6483d0e978ba 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -560,3 +560,16 @@ def test_multiindex(self, index_names): out = df.to_json(orient="table") result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize("strict_check", [ + pytest.param(True, marks=pytest.mark.xfail), False]) + def test_empty_frame_roundtrip(self, strict_check): + # GH 21287 + df = pd.DataFrame([], columns=['a', 'b', 'c']) + expected = df.copy() + out = df.to_json(orient='table') + result = pd.read_json(out, orient='table') + # TODO: When DF coercion issue (#21345) is resolved tighten type checks + tm.assert_frame_equal(expected, result, + check_dtype=strict_check, + check_index_type=strict_check) From daaade45447eadae66e29a094c36bbdb67259d3c Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Fri, 8 Jun 2018 18:32:20 -0500 Subject: [PATCH 048/116] BLD: include dll in package_data on Windows (#21321) (cherry picked from commit 324b324f91021e57106ffc7937f35d54279aac5c) --- doc/source/whatsnew/v0.23.1.txt | 1 + setup.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 3bbacd909c603..db9a23dc66ef0 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -129,3 +129,4 @@ Reshaping Other - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) +- Bug preventing pandas being used on Windows without C++ redistributable installed (:issue:`21106`) diff --git a/setup.py b/setup.py index 6febe674fb2a1..90ec8e91a0700 100755 --- a/setup.py +++ b/setup.py @@ -453,10 +453,10 @@ def pxd(name): return pjoin('pandas', name + '.pxd') -# args to ignore warnings if is_platform_windows(): extra_compile_args = [] else: + # args to ignore warnings extra_compile_args = ['-Wno-unused-function'] lib_depends = lib_depends + ['pandas/_libs/src/numpy_helper.h', @@ -733,7 +733,7 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['data/*', 'templates/*'], + package_data={'': ['data/*', 'templates/*', '_libs/*.dll'], 
'pandas.tests.io': ['data/legacy_hdf/*.h5', 'data/legacy_pickle/*/*.pickle', 'data/legacy_msgpack/*/*.msgpack', From 9440efe996a54427dbafde445f31ce41442eeeae Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jun 2018 19:44:17 +0200 Subject: [PATCH 049/116] REGR: allow merging on object boolean columns (#21310) (cherry picked from commit 8d5032a8c7b00d47fe5d0886145e1ad9dd17e0d3) --- doc/source/whatsnew/v0.23.1.txt | 5 ++--- pandas/core/reshape/merge.py | 10 ++++++++-- pandas/tests/reshape/merge/test_merge.py | 23 +++++++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db9a23dc66ef0..0017372add683 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -65,15 +65,14 @@ In addition, ordering comparisons will raise a ``TypeError`` in the future. a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). - Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values - in nested levels in JSON (:issue:`21158`). + in nested levels in JSON, and to not drop keys with value as `None` (:issue:`21158`, :issue:`21356`). - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) - Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) - Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) -- Fixed regression in :func:`nested_to_record` which now flattens lists of dictionaries and does not drop keys whose value is `None` (:issue:`21356`) - +- Fixed regression in merging on boolean index/columns (:issue:`21119`). ..
_whatsnew_0231.performance: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4d8897fb7c811..d69d79ca9b098 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,6 +28,7 @@ is_int_or_datetime_dtype, is_dtype_equal, is_bool, + is_bool_dtype, is_list_like, is_datetimelike, _ensure_int64, @@ -974,9 +975,14 @@ def _maybe_coerce_merge_keys(self): # Check if we are trying to merge on obviously # incompatible dtypes GH 9780, GH 15800 - elif is_numeric_dtype(lk) and not is_numeric_dtype(rk): + + # boolean values are considered as numeric, but are still allowed + # to be merged on object boolean values + elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk)) + and not is_numeric_dtype(rk)): raise ValueError(msg) - elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): + elif (not is_numeric_dtype(lk) + and (is_numeric_dtype(rk) and not is_bool_dtype(rk))): raise ValueError(msg) elif is_datetimelike(lk) and not is_datetimelike(rk): raise ValueError(msg) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8e639edd34b18..037bd9cc7cd18 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1526,6 +1526,27 @@ def test_merge_on_ints_floats_warning(self): result = B.merge(A, left_on='Y', right_on='X') assert_frame_equal(result, expected[['Y', 'X']]) + def test_merge_incompat_infer_boolean_object(self): + # GH21119: bool + object bool merge OK + df1 = DataFrame({'key': Series([True, False], dtype=object)}) + df2 = DataFrame({'key': [True, False]}) + + expected = DataFrame({'key': [True, False]}, dtype=object) + result = pd.merge(df1, df2, on='key') + assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on='key') + assert_frame_equal(result, expected) + + # with missing value + df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)}) + df2 = DataFrame({'key': [True, False]}) + + expected = DataFrame({'key': [True, False]}, dtype=object) + result = pd.merge(df1, df2, on='key') + assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on='key') + assert_frame_equal(result, expected) + @pytest.mark.parametrize('df1_vals, df2_vals', [ ([0, 1, 2], ["0", "1", "2"]), ([0.0, 1.0, 2.0], ["0", "1", "2"]), @@ -1538,6 +1559,8 @@ def test_merge_on_ints_floats_warning(self): pd.date_range('20130101', periods=3, tz='US/Eastern')), ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), + # TODO ([0, 1], pd.Series([False, True], dtype=bool)), + ([0, 1], pd.Series([False, True], dtype=object)) ]) def test_merge_incompat_dtypes(self, df1_vals, df2_vals): # GH 9780, GH 15800 From c2f21591bc104a1945a27e0666a829c0a6193a17 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 7 Jun 2018 18:05:57 -0400 Subject: [PATCH 050/116] BUG: dropna incorrect with categoricals in pivot_table (#21252) (cherry picked from commit abfac97b2d22447d41bfccaa53e0a264ca34d6d4) --- pandas/core/reshape/pivot.py | 20 ++++++++++++++++++-- pandas/tests/reshape/test_pivot.py | 26 +++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e02420323704e..9a2ad5d13d77a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,8 +1,10 @@ # pylint: disable=E1103 -from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_list_like, 
is_scalar, is_integer_dtype) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -79,8 +81,22 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys, observed=dropna) + # group by the cartesian product of the grouper + # if we have a categorical + grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how='all') + + # gh-21133 + # we want to down cast if + # the original values are ints + # as we grouped with a NaN value + # and then dropped, coercing to floats + for v in [v for v in values if v in data and v in agged]: + if (is_integer_dtype(data[v]) and + not is_integer_dtype(agged[v])): + agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d2cf3fc11e165..3ec60d50f2792 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import datetime, date, timedelta @@ -16,6 +17,11 @@ from pandas.api.types import CategoricalDtype as CDT +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + class TestPivotTable(object): def setup_method(self, method): @@ -109,7 +115,6 @@ def test_pivot_table_categorical(self): index=exp_index) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -137,6 +142,25 @@ def test_pivot_table_dropna_categoricals(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_with_non_observable_dropna(self, dropna): + # gh-21133 + df = pd.DataFrame( + {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], + categories=['low', 'high'], + ordered=True), + 'B': range(5)}) + + result = df.pivot_table(index='A', values='B', dropna=dropna) + expected = pd.DataFrame( + {'B': [2, 3]}, + index=pd.Index( + pd.Categorical.from_codes([0, 1], + categories=['low', 'high'], + ordered=True), + name='A')) + + tm.assert_frame_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) From 3723e80508883c954f67ffbd557991bf6465e114 Mon Sep 17 00:00:00 2001 From: ssikdar1 Date: Thu, 7 Jun 2018 11:58:47 -0400 Subject: [PATCH 051/116] Fix nested_to_record with None values in nested levels (#21164) (cherry picked from commit ab6aaf73a848a8725a23bb880be5221dd5ef5b3d) --- doc/source/whatsnew/v0.23.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 0017372add683..9cb21e8760262 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -117,6 +117,8 @@ I/O - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) +- Bug when :meth:`pandas.io.json.json_normalize` was called with ``None`` values in nested levels in JSON (:issue:`21158`) +- Bug in :meth:`DataFrame.to_csv` and 
:meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) From daec0873ce6396d69d2dbb1f05d1f5740369b794 Mon Sep 17 00:00:00 2001 From: Stefano Cianciulli Date: Thu, 7 Jun 2018 12:23:32 +0100 Subject: [PATCH 052/116] Fix typo in error message in the PlanePlot class (#21350) (cherry picked from commit cea0a81b3d1ade61a5c662458dd8edc135dc94f6) --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 87b7d13251f28..d1a2121597dd6 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -811,7 +811,7 @@ class PlanePlot(MPLPlot): def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: - raise ValueError(self._kind + ' requires and x and y column') + raise ValueError(self._kind + ' requires an x and y column') if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): From 2f1842abc7641f11834e226a6cf03b6a8b73a8b9 Mon Sep 17 00:00:00 2001 From: Max Kanter Date: Tue, 5 Jun 2018 07:08:30 -0400 Subject: [PATCH 053/116] Add Featuretools to Pandas Ecosystem Page (#21297) (cherry picked from commit 67e6e6fcd19d1d89cb60abc3a78372bc85fd8e29) --- doc/source/ecosystem.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 30cdb06b28487..6714398084186 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput Use pandas DataFrames in your `scikit-learn `__ ML pipeline. +`Featuretools `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. .. 
_ecosystem.visualization: From 0eb9bae207240b67404be6106b1d1c8402927f73 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Tue, 5 Jun 2018 05:54:30 +0100 Subject: [PATCH 054/116] BUG: Fix encoding error in to_csv compression (#21300) (cherry picked from commit b32fdc44206c38aecbbe5fdb4ed543a5d213ebb9) --- doc/source/whatsnew/v0.23.1.txt | 8 +++++++ pandas/io/formats/csvs.py | 36 ++++++++++++++++------------- pandas/tests/frame/test_to_csv.py | 38 ++++++++++++++++++++++--------- pandas/tests/series/test_io.py | 36 +++++++++++++++++++++-------- pandas/tests/test_common.py | 23 +++++++++++++++++++ 5 files changed, 104 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 9cb21e8760262..05fad4b99919e 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -121,6 +121,14 @@ I/O - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) +- + +Plotting +^^^^^^^^ + +- +- +>>>>>>> b32fdc442... BUG: Fix encoding error in to_csv compression (#21300) Reshaping diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 29b8d29af0808..7f660e2644fa4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -9,6 +9,7 @@ import numpy as np from pandas.core.dtypes.missing import notna +from pandas.core.dtypes.inference import is_file_like from pandas.core.index import Index, MultiIndex from pandas import compat from pandas.compat import (StringIO, range, zip) @@ -127,14 +128,19 @@ def save(self): else: encoding = self.encoding - if hasattr(self.path_or_buf, 'write'): - f = self.path_or_buf - close = False + # PR 21300 uses string buffer to receive csv writing and dump into + # file-like output with compression as option. GH 21241, 21118 + f = StringIO() + if not is_file_like(self.path_or_buf): + # path_or_buf is path + path_or_buf = self.path_or_buf + elif hasattr(self.path_or_buf, 'name'): + # path_or_buf is file handle + path_or_buf = self.path_or_buf.name else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=encoding, - compression=None) - close = True if self.compression is None else False + # path_or_buf is file-like IO objects. + f = self.path_or_buf + path_or_buf = None try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -151,18 +157,16 @@ def save(self): self._save() finally: - # GH 17778 handles compression for byte strings. - if not close and self.compression: - f.close() - with open(self.path_or_buf, 'r') as f: - data = f.read() - f, handles = _get_handle(self.path_or_buf, self.mode, + # GH 17778 handles zip compression for byte strings separately. 
+ buf = f.getvalue() + if path_or_buf: + f, handles = _get_handle(path_or_buf, self.mode, encoding=encoding, compression=self.compression) - f.write(data) - close = True - if close: + f.write(buf) f.close() + for _fh in handles: + _fh.close() def _save_header(self): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e4829ebf48561..60dc336a85388 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - def test_to_csv_compression(self, compression): - - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + @pytest.mark.parametrize('df,encoding', [ + (DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']), None), + # GH 21241, 21118 + (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'), + (DataFrame(5 * [[123, u"你好", u"世界"]], + columns=['X', 'Y', 'Z']), 'gb2312'), + (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]], + columns=['X', 'Y', 'Z']), 'cp737') + ]) + def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: - df.to_csv(filename, compression=compression) + df.to_csv(filename, compression=compression, encoding=encoding) # test the round trip - to_csv -> read_csv - rs = read_csv(filename, compression=compression, - index_col=0) - assert_frame_equal(df, rs) + result = read_csv(filename, compression=compression, + index_col=0, encoding=encoding) + + with open(filename, 'w') as fh: + df.to_csv(fh, compression=compression, encoding=encoding) + + result_fh = read_csv(filename, compression=compression, + index_col=0, encoding=encoding) + assert_frame_equal(df, result) + assert_frame_equal(df, result_fh) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode('utf8') + text = fh.read().decode(encoding or 'utf8') for col in df.columns: assert col in text with tm.decompress_file(filename, compression) as fh: - assert_frame_equal(df, read_csv(fh, index_col=0)) + assert_frame_equal(df, read_csv(fh, + index_col=0, + encoding=encoding)) def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0b0d4334c86a3..76dd4bc1f3d4a 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -138,29 +138,45 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def test_to_csv_compression(self, compression): - - s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X') + @pytest.mark.parametrize('s,encoding', [ + (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], + name='X'), None), + # GH 21241, 21118 + (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'), + (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'), + (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737') + ]) + def test_to_csv_compression(self, s, encoding, compression): with ensure_clean() as filename: - s.to_csv(filename, compression=compression, header=True) + s.to_csv(filename, compression=compression, encoding=encoding, + header=True) # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression=compression, - 
index_col=0, squeeze=True) - assert_series_equal(s, rs) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) + + with open(filename, 'w') as fh: + s.to_csv(fh, compression=compression, encoding=encoding, + header=True) + + result_fh = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, + squeeze=True) + assert_series_equal(s, result) + assert_series_equal(s, result_fh) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode('utf8') + text = fh.read().decode(encoding or 'utf8') assert s.name in text with tm.decompress_file(filename, compression) as fh: assert_series_equal(s, pd.read_csv(fh, index_col=0, - squeeze=True)) + squeeze=True, + encoding=encoding)) class TestSeriesIO(TestData): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index bb7ee1b911fee..3443331e3d4ba 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -241,3 +241,26 @@ def test_compression_size(obj, method, compression): getattr(obj, method)(filename, compression=None) uncompressed = os.path.getsize(filename) assert uncompressed > compressed + + +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_csv']) +def test_compression_size_fh(obj, method, compression_only): + + with tm.ensure_clean() as filename: + with open(filename, 'w') as fh: + getattr(obj, method)(fh, compression=compression_only) + assert not fh.closed + assert fh.closed + compressed = os.path.getsize(filename) + with tm.ensure_clean() as filename: + with open(filename, 'w') as fh: + getattr(obj, method)(fh, compression=None) + assert not fh.closed + assert fh.closed + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed From 34ab282ec1b25fb88c656f91ee3539d42b539f2d Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 4 Jun 2018 15:28:50 -0600 Subject: [PATCH 055/116] BUG: Allow IntervalIndex to be constructed from categorical data with appropriate dtype (#21254) (cherry picked from commit 686f6047312fe7671d8a5e1b2ffd1866f7c7a766) --- pandas/core/indexes/interval.py | 4 ++++ .../indexes/interval/test_construction.py | 23 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 8f8d8760583ce..eb9d7efc06c27 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values): ------- array """ + if is_categorical_dtype(values): + # GH 21243/21253 + values = np.array(values) + if isinstance(values, (list, tuple)) and len(values) == 0: # GH 19016 # empty lists/tuples get object dtype by default, but this is not diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 5fdf92dcb2044..b1711c3444586 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -6,8 +6,9 @@ from pandas import ( Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical, - date_range, timedelta_range, period_range, notna) + CategoricalIndex, date_range, timedelta_range, period_range, notna) from pandas.compat import lzip +from pandas.core.dtypes.common import 
is_categorical_dtype from pandas.core.dtypes.dtypes import IntervalDtype import pandas.core.common as com import pandas.util.testing as tm @@ -111,6 +112,22 @@ def test_constructor_string(self, constructor, breaks): with tm.assert_raises_regex(TypeError, msg): constructor(**self.get_kwargs_from_breaks(breaks)) + @pytest.mark.parametrize('cat_constructor', [ + Categorical, CategoricalIndex]) + def test_constructor_categorical_valid(self, constructor, cat_constructor): + # GH 21243/21253 + if isinstance(constructor, partial) and constructor.func is Index: + # Index is defined to create CategoricalIndex from categorical data + pytest.skip() + + breaks = np.arange(10, dtype='int64') + expected = IntervalIndex.from_breaks(breaks) + + cat_breaks = cat_constructor(breaks) + result_kwargs = self.get_kwargs_from_breaks(cat_breaks) + result = constructor(**result_kwargs) + tm.assert_index_equal(result, expected) + def test_generic_errors(self, constructor): # filler input data to be used when supplying invalid kwargs filler = self.get_kwargs_from_breaks(range(10)) @@ -238,6 +255,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'): tuples = lzip(breaks[:-1], breaks[1:]) if isinstance(breaks, (list, tuple)): return {'data': tuples} + elif is_categorical_dtype(breaks): + return {'data': breaks._constructor(tuples)} return {'data': com._asarray_tuplesafe(tuples)} def test_constructor_errors(self): @@ -286,6 +305,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'): if isinstance(breaks, list): return {'data': ivs} + elif is_categorical_dtype(breaks): + return {'data': breaks._constructor(ivs)} return {'data': np.array(ivs, dtype=object)} def test_generic_errors(self, constructor): From 118c50142f4da491c94a2cd2233b52052d3572c6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jun 2018 18:41:49 +0200 Subject: [PATCH 056/116] DOC: update whatsnew 0.23.1 (#21387) (cherry picked from commit 0f521ab8eb6c78be92607beadbf6f2c1cbf681b7) --- doc/source/whatsnew/v0.23.1.txt | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 05fad4b99919e..b4d19e24ad392 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -88,24 +88,24 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -Groupby/Resample/Rolling +**Groupby/Resample/Rolling** - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) - Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` - Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) -Data-type specific +**Data-type specific** - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects 
with param ``check_categorical=False`` (:issue:`19776`)
 
-Sparse
+**Sparse**
 
 - Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`)
 
-Indexing
+**Indexing**
 
 - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
 - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
@@ -113,7 +113,11 @@ Indexing
 - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
 - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
 
-I/O
+**Plotting**
+
+- New keywords (``sharex``, ``sharey``) to turn on/off sharing of the x/y-axis by subplots generated with ``pandas.DataFrame().groupby().boxplot()`` (:issue:`20968`)
+
+**I/O**
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
@@ -121,21 +125,13 @@ I/O
 - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
 - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
 - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`)
-- 
-
-Plotting
-^^^^^^^^
-
-- 
--
->>>>>>> b32fdc442... 
BUG: Fix encoding error in to_csv compression (#21300) -Reshaping +**Reshaping** - Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) - Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) -Other +**Other** - Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) - Bug preventing pandas being used on Windows without C++ redistributable installed (:issue:`21106`) From 998c25b2eb0b750e100aaebf5a040b2288972f7c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Jun 2018 10:10:57 -0500 Subject: [PATCH 057/116] Fixup whatsnew --- doc/source/whatsnew/v0.23.1.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index b4d19e24ad392..db25bcf8113f5 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -121,8 +121,6 @@ Bug Fixes - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) -- Bug when :meth:`pandas.io.json.json_normalize` was called with ``None`` values in nested levels in JSON (:issue:`21158`) -- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) From 94857def5257337b2b2a5be145d6cf16306f0719 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Jun 2018 10:41:44 -0500 Subject: [PATCH 058/116] Backport fixture --- pandas/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index e6c1b1b171045..d5f399c7cd63d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -105,6 +105,16 @@ def compression(request): return request.param +@pytest.fixture(params=['gzip', 'bz2', 'zip', + pytest.param('xz', marks=td.skip_if_no_lzma)]) +def compression_only(request): + """ + Fixture for trying common compression types in compression tests excluding + uncompressed case + """ + return request.param + + @pytest.fixture(scope='module') def datetime_tz_utc(): from datetime import timezone From 81d609caf6b3685f70ca75a388f8f0ccb1e95b7e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 12 Jun 2018 11:42:55 -0500 Subject: [PATCH 059/116] DOC: 0.23.1 release (#21446) (cherry picked from commit ab668b0a56a9f2aee959bde787e9a0af4068d7a7) --- doc/source/release.rst | 49 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 32db2ff5ebb24..2f7eedfbe9a45 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,10 +37,57 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: https://pypi.org/project/pandas * Documentation: http://pandas.pydata.org +pandas 0.23.1 +------------- + +**Release date**: June 12, 2018 + +This is a minor release from 0.23.0 and includes a number of bug fixes and +performance improvements. 
+
+See the :ref:`full whatsnew <whatsnew_0231>` for a list of all the changes.
+
+Thanks
+~~~~~~
+
+A total of 30 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Adam J. Stewart
+* Adam Kim +
+* Aly Sivji
+* Chalmer Lowe +
+* Damini Satya +
+* Dr. Irv
+* Gabe Fernando +
+* Giftlin Rajaiah
+* Jeff Reback
+* Jeremy Schendel +
+* Joris Van den Bossche
+* Kalyan Gokhale +
+* Kevin Sheppard
+* Matthew Roeschke
+* Max Kanter +
+* Ming Li
+* Pyry Kovanen +
+* Stefano Cianciulli
+* Tom Augspurger
+* Uddeshya Singh +
+* Wenhuan
+* William Ayd
+* chris-b1
+* gfyoung
+* h-vetinari
+* nprad +
+* ssikdar1 +
+* tmnhat2001
+* topper-123
+* zertrin
+
+
 pandas 0.23.0
 -------------
 
-**Release date**: May 15, 2017
+**Release date**: May 15, 2018
 
 This is a major release from 0.22.0 and includes a number of API changes,
 new features, enhancements, and performance improvements along with a large number
From fccc56c1719975dbb553ae9cad23a34d07d3dcd3 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 12 Jun 2018 12:39:04 -0500
Subject: [PATCH 060/116] DOC: include 0.23.1 whatsnew

---
 doc/source/whatsnew.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index d61a98fe2dae4..eb9211d0ceb02 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -18,6 +18,8 @@ What's New
 
 These are new features and improvements of note in each release.
 
+.. include:: whatsnew/v0.23.1.txt
+
 .. include:: whatsnew/v0.23.0.txt
 
 .. include:: whatsnew/v0.22.0.txt
From 1a23779f09abc6ebf908d66ee88b973b767e2e3c Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 12 Jun 2018 12:42:08 -0500
Subject: [PATCH 061/116] RLS: 0.23.1

From 7cbebaf981e45648358943e626edc25677c2104d Mon Sep 17 00:00:00 2001
From: Uddeshya Singh
Date: Thu, 21 Jun 2018 15:09:18 +0530
Subject: [PATCH 062/116] Update merging.rst (#21568)

---
 doc/source/merging.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index 1161656731f88..4d7cd0bdadef7 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -279,7 +279,7 @@ need to be:
 Ignoring indexes on the concatenation axis
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For ``DataFrame``s which don't have a meaningful index, you may wish to append
+For ``DataFrame`` s which don't have a meaningful index, you may wish to append
 them and ignore the fact that they may have overlapping indexes. To do this,
 use the ``ignore_index`` argument:
 
@@ -314,7 +314,7 @@ This is also a valid argument to :meth:`DataFrame.append`:
 Concatenating with mixed ndims
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-You can concatenate a mix of ``Series`` and ``DataFrame``s. The
+You can concatenate a mix of ``Series`` and ``DataFrame`` s. The
 ``Series`` will be transformed to ``DataFrame`` with the column name as the
 name of the ``Series``.
 
From 3b65b9572a1fc8a2b232544d4e194b7d9eacdaa6 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 11 Jun 2018 17:15:29 -0700
Subject: [PATCH 063/116] DOC: Add 0.23.2 whatsnew template (#21433)

(cherry picked from commit 879b15f3476d81d51f236d13684444579bafb8fd)
---
 doc/source/whatsnew/v0.23.2.txt | 82 +++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 doc/source/whatsnew/v0.23.2.txt

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
new file mode 100644
index 0000000000000..ec2eddcfd4d41
--- /dev/null
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -0,0 +1,82 @@
+.. 
_whatsnew_0232: + +v0.23.2 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. contents:: What's new in v0.23.2 + :local: + :backlinks: none + +.. _whatsnew_0232.enhancements: + +New features +~~~~~~~~~~~~ + + +.. _whatsnew_0232.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0232.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- +- + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0232.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +- +- + +Conversion +^^^^^^^^^^ + +- +- + +Indexing +^^^^^^^^ + +- +- + +I/O +^^^ + +- +- + +Plotting +^^^^^^^^ + +- +- + +Reshaping +^^^^^^^^^ + +- +- + +Categorical +^^^^^^^^^^^ + +- From 22c5145861fcf21567e46dcb7fb608b08cdd66a1 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 11 Jun 2018 17:16:36 -0700 Subject: [PATCH 064/116] MAINT: More friendly error msg on Index overflow (#21377) * MAINT: More useful error msg on Index overflow Display a more friendly error message when there is an OverflowError during Index construction. Partially addresses gh-15832. * DOC: Clarify how Index.__new__ handles dtype Partially addresses gh-15823. (cherry picked from commit defdb34bafa3900069d399ce597c0abbd4a2b0cc) --- pandas/core/indexes/base.py | 12 +++++++++++- pandas/tests/indexes/test_base.py | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 90238af9b3632..5fdb8fc59deca 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -187,6 +187,9 @@ class Index(IndexOpsMixin, PandasObject): ---------- data : array-like (1-dimensional) dtype : NumPy dtype (default: object) + If dtype is None, we find the dtype that best fits the data. + If an actual dtype is provided, we coerce to that dtype if it's safe. + Otherwise, an error will be raised. 
copy : bool Make a copy of input ndarray name : object @@ -312,7 +315,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if is_integer_dtype(dtype): inferred = lib.infer_dtype(data) if inferred == 'integer': - data = np.array(data, copy=copy, dtype=dtype) + try: + data = np.array(data, copy=copy, dtype=dtype) + except OverflowError: + # gh-15823: a more user-friendly error message + raise OverflowError( + "the elements provided in the data cannot " + "all be casted to the dtype {dtype}" + .format(dtype=dtype)) elif inferred in ['floating', 'mixed-integer-float']: if isna(data).any(): raise ValueError('cannot convert float ' diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1e4dd2921b3f5..19acfb294762c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -455,6 +455,13 @@ def test_constructor_nonhashable_name(self, indices): tm.assert_raises_regex(TypeError, message, indices.set_names, names=renamed) + def test_constructor_overflow_int64(self): + # see gh-15832 + msg = ("the elements provided in the data cannot " + "all be casted to the dtype int64") + with tm.assert_raises_regex(OverflowError, msg): + Index([np.iinfo(np.uint64).max - 1], dtype="int64") + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', From 191767168dfa21639d16a16319245969a8e974ad Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Jun 2018 09:54:11 +0200 Subject: [PATCH 065/116] DOC: follow 0.23.1 template for 0.23.2 whatsnew (#21435) (cherry picked from commit 1275f91b74d8a48671eb8e705807bf852a8806a8) --- doc/source/whatsnew/v0.23.2.txt | 36 +++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ec2eddcfd4d41..c636e73fbd6c2 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -10,16 +10,11 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none -.. _whatsnew_0232.enhancements: -New features -~~~~~~~~~~~~ +.. _whatsnew_0232.fixed_regressions: - -.. 
_whatsnew_0232.deprecations: - -Deprecations -~~~~~~~~~~~~ +Fixed Regressions +~~~~~~~~~~~~~~~~~ - - @@ -43,40 +38,41 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +**Groupby/Resample/Rolling** + - - -Conversion -^^^^^^^^^^ +**Conversion** + - - -Indexing -^^^^^^^^ +**Indexing** - - -I/O -^^^ +**I/O** - - -Plotting -^^^^^^^^ +**Plotting** - - -Reshaping -^^^^^^^^^ +**Reshaping** - - -Categorical -^^^^^^^^^^^ +**Categorical** + +- + +**Other** - From 475c8bcfde52545b7f46d3035691f20487415160 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 13 Jun 2018 11:25:58 +0100 Subject: [PATCH 066/116] Fix tests fragile to PATH (#21453) (cherry picked from commit 7a49449b8c95fed027af1da35970743f23a93dff) --- pandas/tests/plotting/test_converter.py | 3 ++- pandas/tests/test_downstream.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 47cded19f5300..bb976a1e3e81c 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -1,4 +1,5 @@ import subprocess +import sys import pytest from datetime import datetime, date @@ -27,7 +28,7 @@ def test_register_by_default(self): "import pandas as pd; " "units = dict(matplotlib.units.registry); " "assert pd.Timestamp in units)'") - call = ['python', '-c', code] + call = [sys.executable, '-c', code] assert subprocess.check_call(call) == 0 def test_warns(self): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index afd7993fefc70..cf98cff97669a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -3,6 +3,7 @@ Testing that we work in the downstream packages """ import subprocess +import sys import pytest import numpy as np # noqa @@ -57,7 +58,7 @@ def test_xarray(df): def test_oo_optimizable(): # GH 21071 - subprocess.check_call(["python", "-OO", "-c", "import pandas"]) + subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) @tm.network From d4c48aaadfa2a6cbf2375631101b79752504f004 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 13 Jun 2018 03:51:41 -0700 Subject: [PATCH 067/116] BUG: Construct Timestamp with tz correctly near DST border (#21407) (cherry picked from commit bc4ccd7dfaceb92ac2c6dc345c1bc4489407108f) --- doc/source/whatsnew/v0.23.2.txt | 4 ++++ pandas/_libs/tslibs/conversion.pyx | 22 ++++--------------- pandas/tests/frame/test_timezones.py | 10 +++++++++ .../indexes/datetimes/test_construction.py | 9 ++++++++ .../indexes/datetimes/test_date_range.py | 14 ++++++++++++ .../tests/scalar/timestamp/test_timestamp.py | 8 +++++++ 6 files changed, 49 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index c636e73fbd6c2..1de44ffeb4160 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -73,6 +73,10 @@ Bug Fixes - +**Timezones** +- Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) +- Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) + **Other** - diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f4841e6abb7e8..3cbef82437544 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -347,25 +347,11 @@ cdef _TSObject 
convert_datetime_to_tsobject(datetime ts, object tz,
     if tz is not None:
         tz = maybe_get_tz(tz)
 
-        # sort of a temporary hack
         if ts.tzinfo is not None:
-            if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'):
-                ts = tz.normalize(ts)
-                obj.value = pydatetime_to_dt64(ts, &obj.dts)
-                obj.tzinfo = ts.tzinfo
-            else:
-                # tzoffset
-                try:
-                    tz = ts.astimezone(tz).tzinfo
-                except:
-                    pass
-                obj.value = pydatetime_to_dt64(ts, &obj.dts)
-                ts_offset = get_utcoffset(ts.tzinfo, ts)
-                obj.value -= int(ts_offset.total_seconds() * 1e9)
-                tz_offset = get_utcoffset(tz, ts)
-                obj.value += int(tz_offset.total_seconds() * 1e9)
-                dt64_to_dtstruct(obj.value, &obj.dts)
-                obj.tzinfo = tz
+            # Convert the current timezone to the passed timezone
+            ts = ts.astimezone(tz)
+            obj.value = pydatetime_to_dt64(ts, &obj.dts)
+            obj.tzinfo = ts.tzinfo
     elif not is_utc(tz):
         ts = _localize_pydatetime(ts, tz)
         obj.value = pydatetime_to_dt64(ts, &obj.dts)
diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py
index fa589a0aa4817..3956968173070 100644
--- a/pandas/tests/frame/test_timezones.py
+++ b/pandas/tests/frame/test_timezones.py
@@ -133,3 +133,13 @@ def test_frame_reset_index(self, tz):
         xp = df.index.tz
         rs = roundtripped.index.tz
         assert xp == rs
+
+    @pytest.mark.parametrize('tz', [None, 'America/New_York'])
+    def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
+        # GH 19970
+        idx = date_range('20161101', '20161130', freq='4H', tz=tz)
+        df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))},
+                       index=idx)
+        result = df.T == df.T
+        expected = DataFrame(True, index=list('ab'), columns=idx)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py
index dae69a86910af..b138b79caac76 100644
--- a/pandas/tests/indexes/datetimes/test_construction.py
+++ b/pandas/tests/indexes/datetimes/test_construction.py
@@ -469,6 +469,15 @@ def test_constructor_with_non_normalized_pytz(self, tz):
         result = DatetimeIndex(['2010'], tz=non_norm_tz)
         assert pytz.timezone(tz) is result.tz
 
+    def test_constructor_timestamp_near_dst(self):
+        # GH 20854
+        ts = [Timestamp('2016-10-30 03:00:00+0300', tz='Europe/Helsinki'),
+              Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')]
+        result = DatetimeIndex(ts)
+        expected = DatetimeIndex([ts[0].to_pydatetime(),
+                                  ts[1].to_pydatetime()])
+        tm.assert_index_equal(result, expected)
+
 
 class TestTimeSeries(object):
 
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index 193804b66395b..ec37bbbcb6c02 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -278,6 +278,20 @@ def test_wom_len(self, periods):
         res = date_range(start='20110101', periods=periods, freq='WOM-1MON')
         assert len(res) == periods
 
+    def test_construct_over_dst(self):
+        # GH 20854
+        pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific',
+                                                               ambiguous=True)
+        pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific',
+                                                               ambiguous=False)
+        expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'),
+                       pre_dst,
+                       pst_dst]
+        expected = DatetimeIndex(expect_data)
+        result = date_range(start='2010-11-7', periods=3,
+                            freq='H', tz='US/Pacific')
+        tm.assert_index_equal(result, expected)
+
 
 class TestGenRangeGeneration(object):
 
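For reference, a minimal sketch of the round-trip behaviour these new tests pin down; this snippet is an illustration added for this write-up, not part of the patch itself:

    import pandas as pd

    # An already-localized Timestamp near the Europe/Helsinki DST exit should
    # round-trip when the same tz is passed again (GH 20854); previously the
    # result came back shifted to before the transition.
    expected = pd.Timestamp('2016-10-30 03:00:00+0300', tz='Europe/Helsinki')
    result = pd.Timestamp(expected, tz='Europe/Helsinki')
    assert result == expected

The scalar constructor path is exercised directly in the test file below.

diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index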
ab87d98fca8eb..4689c7bea626f 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -528,6 +528,14 @@ def test_disallow_setting_tz(self, tz): with pytest.raises(AttributeError): ts.tz = tz + @pytest.mark.parametrize('offset', ['+0300', '+0200']) + def test_construct_timestamp_near_dst(self, offset): + # GH 20854 + expected = Timestamp('2016-10-30 03:00:00{}'.format(offset), + tz='Europe/Helsinki') + result = Timestamp(expected, tz='Europe/Helsinki') + assert result == expected + class TestTimestamp(object): From 14e5f3d4e604a9abe1ebefe9a136b026add6a7fc Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 13 Jun 2018 15:24:01 +0200 Subject: [PATCH 068/116] BUG: fix get_indexer_non_unique with CategoricalIndex key (#21457) closes #21448 (cherry picked from commit 576d5c6b76e039a411a7cc4c0de29813e2de0149) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/core/indexes/base.py | 3 +++ pandas/core/indexes/category.py | 7 ++++++- pandas/tests/categorical/test_indexing.py | 20 +++++++++++++++++++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 1de44ffeb4160..3e4326dea2ecc 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -51,7 +51,7 @@ Bug Fixes **Indexing** -- +- Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - **I/O** diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5fdb8fc59deca..a85a0ea88855c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,6 +31,7 @@ is_dtype_equal, is_dtype_union_equal, is_object_dtype, + is_categorical, is_categorical_dtype, is_interval_dtype, is_period_dtype, @@ -3357,6 +3358,8 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = _ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..587090fa72def 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -598,7 +598,12 @@ def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): - target = target.categories + # Indexing on codes is more efficient if categories are the same: + if target.categories is self.categories: + target = target.codes + indexer, missing = self._engine.get_indexer_non_unique(target) + return _ensure_platform_int(indexer), missing + target = target.values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) diff --git a/pandas/tests/categorical/test_indexing.py b/pandas/tests/categorical/test_indexing.py index 9c27b1101e5ca..cf7b5cfa55882 100644 --- a/pandas/tests/categorical/test_indexing.py +++ b/pandas/tests/categorical/test_indexing.py @@ -5,7 +5,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import Categorical, Index, PeriodIndex +from pandas import Categorical, Index, CategoricalIndex, PeriodIndex from pandas.tests.categorical.common import TestCategorical @@ -103,3 +103,21 @@ def f(): s.categories = [1, 2] 
    pytest.raises(ValueError, f)
+
+    # Combinations of sorted/unique:
+    @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4],
+                                            [1, 3, 3, 4], [1, 2, 2, 4]])
+    # Combinations of missing/unique
+    @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
+    @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
+    def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
+        # GH 21448
+        key = key_class(key_values, categories=range(1, 5))
+        # Test for flat index and CategoricalIndex with same/different cats:
+        for dtype in None, 'category', key.dtype:
+            idx = Index(idx_values, dtype=dtype)
+            expected, exp_miss = idx.get_indexer_non_unique(key_values)
+            result, res_miss = idx.get_indexer_non_unique(key)
+
+            tm.assert_numpy_array_equal(expected, result)
+            tm.assert_numpy_array_equal(exp_miss, res_miss)
From 2272ef4d7d99018f6f570317f7ec3a3d0cd92580 Mon Sep 17 00:00:00 2001
From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com>
Date: Thu, 14 Jun 2018 15:53:14 +0530
Subject: [PATCH 069/116] CLN: Comparison methods for MultiIndex should have
 consistent behaviour for all nlevels (GH21149) (#21195)

(cherry picked from commit a8738ba69cd817f7d57c8c25957d2a59621e875f)
---
 doc/source/whatsnew/v0.23.2.txt    |  1 +
 pandas/core/indexes/base.py        |  3 ++-
 pandas/tests/indexes/test_multi.py | 17 +++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 3e4326dea2ecc..0d3f9cb8dd3b6 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -52,6 +52,7 @@ Bug Fixes
 **Indexing**
 
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
+- Bug in comparison operations for :class:`MultiIndex` where an error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
 -
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a85a0ea88855c..a2e237c8cc45d 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -97,7 +97,8 @@ def cmp_method(self, other):
         if needs_i8_conversion(self) and needs_i8_conversion(other):
             return self._evaluate_compare(other, op)
 
-        if is_object_dtype(self) and self.nlevels == 1:
+        from .multi import MultiIndex
+        if is_object_dtype(self) and not isinstance(self, MultiIndex):
             # don't pass MultiIndex
             with np.errstate(all='ignore'):
                 result = ops._comp_method_OBJECT_ARRAY(op, self.values, other)
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 182dbdf2cf4e4..df506ae9486ee 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -3295,3 +3295,20 @@ def test_duplicate_multiindex_labels(self):
         with pytest.raises(ValueError):
             ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
                            inplace=True)
+
+    def test_multiindex_compare(self):
+        # GH 21149
+        # Ensure comparison operations for MultiIndex with nlevels == 1
+        # behave consistently with those for MultiIndex with nlevels > 1
+
+        midx = pd.MultiIndex.from_product([[0, 1]])
+
+        # Equality self-test: MultiIndex object vs self
+        expected = pd.Series([True, True])
+        result = pd.Series(midx == midx)
+        tm.assert_series_equal(result, expected)
+
+        # Greater than comparison: MultiIndex object vs self
+        expected = pd.Series([False, False])
+        result = pd.Series(midx > midx)
+        tm.assert_series_equal(result, expected)
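For reference, a minimal sketch of the invariant the patch above establishes; this snippet is an illustration added for this write-up, not part of the patch itself:

    import numpy as np
    import pandas as pd

    # A one-level MultiIndex now compares elementwise, just like a MultiIndex
    # with nlevels > 1, instead of raising (GH 21149).
    midx = pd.MultiIndex.from_product([[0, 1]])
    assert np.array_equal(midx == midx, [True, True])
    assert np.array_equal(midx > midx, [False, False])

From e4e48f8f34adcf1fe6e37ead4cfd2b0b55547f74 Mon Sep 17 00:00:00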
2001 From: Jeremy Schendel Date: Fri, 15 Jun 2018 11:21:36 -0600 Subject: [PATCH 070/116] BUG: Fix Series.nlargest for integer boundary values (#21432) (cherry picked from commit ec5956ed350d33ac2cee07bf9a24ea5315529443) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/conftest.py | 71 +++++++++ pandas/core/algorithms.py | 5 +- pandas/tests/frame/test_analytics.py | 78 +++++----- pandas/tests/series/test_analytics.py | 209 ++++++++++++++++++++++++++ 5 files changed, 321 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 0d3f9cb8dd3b6..d839a72323c78 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -80,4 +80,5 @@ Bug Fixes **Other** +- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - diff --git a/pandas/conftest.py b/pandas/conftest.py index d5f399c7cd63d..9d806a91f37f7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -129,6 +129,14 @@ def join_type(request): return request.param +@pytest.fixture(params=['nlargest', 'nsmallest']) +def nselect_method(request): + """ + Fixture for trying all nselect methods + """ + return request.param + + @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) def nulls_fixture(request): """ @@ -170,3 +178,66 @@ def string_dtype(request): * 'U' """ return request.param + + +@pytest.fixture(params=["float32", "float64"]) +def float_dtype(request): + """ + Parameterized fixture for float dtypes. + + * float32 + * float64 + """ + + return request.param + + +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES + + +@pytest.fixture(params=SIGNED_INT_DTYPES) +def sint_dtype(request): + """ + Parameterized fixture for signed integer dtypes. + + * int8 + * int16 + * int32 + * int64 + """ + + return request.param + + +@pytest.fixture(params=UNSIGNED_INT_DTYPES) +def uint_dtype(request): + """ + Parameterized fixture for unsigned integer dtypes. + + * uint8 + * uint16 + * uint32 + * uint64 + """ + + return request.param + + +@pytest.fixture(params=ALL_INT_DTYPES) +def any_int_dtype(request): + """ + Parameterized fixture for any integer dtypes. 
+ + * int8 + * uint8 + * int16 + * uint16 + * int32 + * uint32 + * int64 + * uint64 + """ + + return request.param diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88bc497f9f22d..bcde32696c1ff 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1131,9 +1131,12 @@ def compute(self, method): return dropped[slc].sort_values(ascending=ascending).head(n) # fast method - arr, _, _ = _ensure_data(dropped.values) + arr, pandas_dtype, _ = _ensure_data(dropped.values) if method == 'nlargest': arr = -arr + if is_integer_dtype(pandas_dtype): + # GH 21426: ensure reverse ordering at boundaries + arr -= 1 if self.keep == 'last': arr = arr[::-1] diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index d1a4a5f615b86..90d7c46f7554f 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas.compat import lrange, product, PY35 +from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, _np_version_under1p12, _np_version_under1p15) @@ -2240,54 +2240,49 @@ class TestNLargestNSmallest(object): # ---------------------------------------------------------------------- # Top / bottom - @pytest.mark.parametrize( - 'method, n, order', - product(['nsmallest', 'nlargest'], range(1, 11), - [['a'], - ['c'], - ['a', 'b'], - ['a', 'c'], - ['b', 'a'], - ['b', 'c'], - ['a', 'b', 'c'], - ['c', 'a', 'b'], - ['c', 'b', 'a'], - ['b', 'c', 'a'], - ['b', 'a', 'c'], - - # dups! - ['b', 'c', 'c'], - - ])) - def test_n(self, df_strings, method, n, order): + @pytest.mark.parametrize('order', [ + ['a'], + ['c'], + ['a', 'b'], + ['a', 'c'], + ['b', 'a'], + ['b', 'c'], + ['a', 'b', 'c'], + ['c', 'a', 'b'], + ['c', 'b', 'a'], + ['b', 'c', 'a'], + ['b', 'a', 'c'], + + # dups! 
+ ['b', 'c', 'c']]) + @pytest.mark.parametrize('n', range(1, 11)) + def test_n(self, df_strings, nselect_method, n, order): # GH10393 df = df_strings if 'b' in order: error_msg = self.dtype_error_msg_template.format( - column='b', method=method, dtype='object') + column='b', method=nselect_method, dtype='object') with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(n, order) + getattr(df, nselect_method)(n, order) else: - ascending = method == 'nsmallest' - result = getattr(df, method)(n, order) + ascending = nselect_method == 'nsmallest' + result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'method, columns', - product(['nsmallest', 'nlargest'], - product(['group'], ['category_string', 'string']) - )) - def test_n_error(self, df_main_dtypes, method, columns): + @pytest.mark.parametrize('columns', [ + ('group', 'category_string'), ('group', 'string')]) + def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes + col = columns[1] error_msg = self.dtype_error_msg_template.format( - column=columns[1], method=method, dtype=df[columns[1]].dtype) + column=col, method=nselect_method, dtype=df[col].dtype) # escape some characters that may be in the repr error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") .replace("[", "\\[").replace("]", "\\]")) with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(2, columns) + getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes @@ -2308,15 +2303,14 @@ def test_n_identical_values(self): expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'n, order', - product([1, 2, 3, 4, 5], - [['a', 'b', 'c'], - ['c', 'b', 'a'], - ['a'], - ['b'], - ['a', 'b'], - ['c', 'b']])) + @pytest.mark.parametrize('order', [ + ['a', 'b', 'c'], + ['c', 'b', 'a'], + ['a'], + ['b'], + ['a', 'b'], + ['c', 'b']]) + @pytest.mark.parametrize('n', range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6ea40329f4bc3..7a78b562ac1fa 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1866,6 +1866,189 @@ def s_main_dtypes(): return df +class TestMode(object): + + @pytest.mark.parametrize('dropna, expected', [ + (True, Series([], dtype=np.float64)), + (False, Series([], dtype=np.float64)) + ]) + def test_mode_empty(self, dropna, expected): + s = Series([], dtype=np.float64) + result = s.mode(dropna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, data, expected', [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ]) + @pytest.mark.parametrize( + 'dt', + list(np.typecodes['AllInteger'] + np.typecodes['Float']) + ) + def test_mode_numerical(self, dropna, data, expected, dt): + s = Series(data, dtype=dt) + result = s.mode(dropna) + expected = Series(expected, dtype=dt) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected', [ + (True, [1.0]), + (False, [1, np.nan]), + ]) + def test_mode_numerical_nan(self, dropna, expected): + s = Series([1, 1, 2, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected) + 
tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, ['b'], ['bar'], ['nan']), + (False, ['b'], [np.nan], ['nan']) + ]) + def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + # Test string and object types. + data = ['a'] * 2 + ['b'] * 3 + + s = Series(data, dtype='c') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='c') + tm.assert_series_equal(result, expected1) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected2) + + data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + + s = Series(data, dtype=object).astype(str) + result = s.mode(dropna) + expected3 = Series(expected3, dtype=str) + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['foo'], ['foo']), + (False, ['foo'], [np.nan]) + ]) + def test_mode_mixeddtype(self, dropna, expected1, expected2): + s = Series([1, 'foo', 'foo']) + result = s.mode(dropna) + expected = Series(expected1) + tm.assert_series_equal(result, expected) + + s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['1900-05-03', '2011-01-03', '2013-01-02'], + ['2011-01-03', '2013-01-02']), + (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), + ]) + def test_mode_datetime(self, dropna, expected1, expected2): + s = Series(['2011-01-03', '2013-01-02', + '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='M8[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', + '2011-01-03', '2013-01-02', 'nan', 'nan'], + dtype='M8[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='M8[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), + (False, [np.nan], [np.nan, '2 min', '1 day']), + ]) + def test_mode_timedelta(self, dropna, expected1, expected2): + # gh-5986: Test timedelta types. 
+ + s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected1 = Series(expected1, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected1) + + s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', + '2 min', '2 min', 'nan', 'nan'], + dtype='timedelta64[ns]') + result = s.mode(dropna) + expected2 = Series(expected2, dtype='timedelta64[ns]') + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ + (True, Categorical([1, 2], categories=[1, 2]), + Categorical(['a'], categories=[1, 'a']), + Categorical([3, 1], categories=[3, 2, 1], ordered=True)), + (False, Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, 'a'], categories=[1, 'a']), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), + ]) + def test_mode_category(self, dropna, expected1, expected2, expected3): + s = Series(Categorical([1, 2, np.nan, np.nan])) + result = s.mode(dropna) + expected1 = Series(expected1, dtype='category') + tm.assert_series_equal(result, expected1) + + s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + result = s.mode(dropna) + expected2 = Series(expected2, dtype='category') + tm.assert_series_equal(result, expected2) + + s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], + categories=[3, 2, 1], ordered=True)) + result = s.mode(dropna) + expected3 = Series(expected3, dtype='category') + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize('dropna, expected1, expected2', [ + (True, [2**63], [1, 2**63]), + (False, [2**63], [1, 2**63]) + ]) + def test_mode_intoverflow(self, dropna, expected1, expected2): + # Test for uint64 overflow. + s = Series([1, 2**63, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=np.uint64) + tm.assert_series_equal(result, expected1) + + s = Series([1, 2**63], dtype=np.uint64) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=np.uint64) + tm.assert_series_equal(result, expected2) + + @pytest.mark.skipif(not compat.PY3, reason="only PY3") + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + expected = Series(['foo', np.nan]) + s = Series([1, 'foo', 'foo', np.nan, np.nan]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = s.mode(dropna=False) + result = result.sort_values().reset_index(drop=True) + + tm.assert_series_equal(result, expected) + + +def assert_check_nselect_boundary(vals, dtype, method): + # helper function for 'test_boundary_{dtype}' tests + s = Series(vals, dtype=dtype) + result = getattr(s, method)(3) + expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] + expected = s.loc[expected_idxr] + tm.assert_series_equal(result, expected) + + class TestNLargestNSmallest(object): @pytest.mark.parametrize( @@ -1950,6 +2133,32 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) + def test_boundary_integer(self, nselect_method, any_int_dtype): + # GH 21426 + dtype_info = np.iinfo(any_int_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val, min_val + 1, max_val - 1, max_val] + assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) + + def test_boundary_float(self, nselect_method, float_dtype): + # GH 21426 + dtype_info = np.finfo(float_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + min_2nd, max_2nd = np.nextafter( + [min_val, 
max_val], 0, dtype=float_dtype)
+        vals = [min_val, min_2nd, max_2nd, max_val]
+        assert_check_nselect_boundary(vals, float_dtype, nselect_method)
+
+    @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
+    def test_boundary_datetimelike(self, nselect_method, dtype):
+        # GH 21426
+        # use int64 bounds and +1 to min_val since true minimum is NaT
+        # (include min_val/NaT at end to maintain same expected_idxr)
+        dtype_info = np.iinfo('int64')
+        min_val, max_val = dtype_info.min, dtype_info.max
+        vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
+        assert_check_nselect_boundary(vals, dtype, nselect_method)
+
 
 class TestCategoricalSeriesAnalytics(object):
 
From e9ee3a10f8d2eb0ef927e7ad5007fac6d64217ae Mon Sep 17 00:00:00 2001
From: Pietro Battiston
Date: Mon, 18 Jun 2018 23:42:59 +0200
Subject: [PATCH 071/116] PERF: remove useless overrides (#21523)

closes #21522

(cherry picked from commit ea54d390ac69a4421f8e88810dd058e9894daf26)
---
 pandas/core/indexes/multi.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index fbcf06a28c1e5..c8332d762f7ef 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -852,14 +852,6 @@ def _has_complex_internals(self):
         # to disable groupby tricks
         return True
 
-    @cache_readonly
-    def is_monotonic(self):
-        """
-        return if the index is monotonic increasing (only equal or
-        increasing) values.
-        """
-        return self.is_monotonic_increasing
-
     @cache_readonly
     def is_monotonic_increasing(self):
         """
@@ -887,10 +879,6 @@ def is_monotonic_decreasing(self):
         # monotonic decreasing if and only if reverse is monotonic increasing
         return self[::-1].is_monotonic_increasing
 
-    @cache_readonly
-    def is_unique(self):
-        return not self.duplicated().any()
-
     @cache_readonly
     def _have_mixed_levels(self):
         """ return a boolean list indicated if we have mixed levels """
From 76551c2540a51c028193a16843b7e6d9fcbe47ba Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 18 Jun 2018 17:39:39 -0500
Subject: [PATCH 072/116] BUG: Timedelta.__bool__ (#21485)

Closes #21484

(cherry picked from commit d5a1232da14e86dea2b3db8b61741f3f9b56e55a)
---
 doc/source/whatsnew/v0.23.2.txt                 |  9 ++++++---
 pandas/_libs/tslibs/timedeltas.pyx              |  3 +++
 pandas/tests/scalar/timedelta/test_timedelta.py | 14 ++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index d839a72323c78..ea6d8620289f8 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -43,10 +43,13 @@ Bug Fixes
 -
 -
 
-**Conversion**
+**Timedelta**
+- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`)
 
-- 
+**Conversion**
+
+- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
 -
 
 **Indexing**
@@ -75,10 +78,10 @@ Bug Fixes
 -
 
 **Timezones**
+
 - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`)
 - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`)
 
 **Other**
 
-- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
 -
 
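For reference, a minimal sketch of the truthiness contract described in the whatsnew entry above; this snippet is an illustration added for this write-up, not part of the patch itself:

    import pandas as pd

    # Truthiness now follows the underlying nanosecond value: any non-zero
    # Timedelta (even one shorter than 1 microsecond) is True, zero is False.
    assert bool(pd.Timedelta(10, unit='ns'))
    assert bool(pd.Timedelta(-10, unit='ns'))
    assert not bool(pd.Timedelta(0, unit='ns'))

diff --git a/pandas/_libs/tslibs/timedeltas.pyx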
b/pandas/_libs/tslibs/timedeltas.pyx index e2b0b33053f83..769f3ca5fa8bf 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -899,6 +899,9 @@ cdef class _Timedelta(timedelta): def __str__(self): return self._repr_base(format='long') + def __bool__(self): + return self.value != 0 + def isoformat(self): """ Format Timedelta as ISO 8601 Duration like diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 205fdf49d3e91..6472bd4245622 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -588,3 +588,17 @@ def test_components(self): result = s.dt.components assert not result.iloc[0].isna().all() assert result.iloc[1].isna().all() + + +@pytest.mark.parametrize('value, expected', [ + (Timedelta('10S'), True), + (Timedelta('-10S'), True), + (Timedelta(10, unit='ns'), True), + (Timedelta(0, unit='ns'), False), + (Timedelta(-10, unit='ns'), True), + (Timedelta(None), True), + (pd.NaT, True), +]) +def test_truthiness(value, expected): + # https://github.com/pandas-dev/pandas/issues/21484 + assert bool(value) is expected From eb6f3681557f61aca378dd81ad92ff09fb05ad15 Mon Sep 17 00:00:00 2001 From: David Krych Date: Mon, 18 Jun 2018 18:43:27 -0400 Subject: [PATCH 073/116] BUG: Fix Index construction when given empty generator (#21470). (#21481) (cherry picked from commit 076635ac3a33b819f4ae0fb1f95106bf8e4bf329) --- doc/source/whatsnew/v0.23.2.txt | 3 ++- pandas/core/arrays/categorical.py | 5 ++--- pandas/core/indexes/base.py | 10 ++++++---- pandas/tests/indexes/test_base.py | 19 +++++++++++-------- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ea6d8620289f8..2af89c15bb8fb 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -49,8 +49,9 @@ Bug Fixes **Conversion** +- Bug in constructing :class:`Index` with an iterator or generator (:issue:`21470`) - Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) -- + **Indexing** diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a1a8f098b582e..b587a4c0bc722 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -3,7 +3,6 @@ import numpy as np from warnings import warn import textwrap -import types from pandas import compat from pandas.compat import u, lzip @@ -28,7 +27,7 @@ is_categorical, is_categorical_dtype, is_list_like, is_sequence, - is_scalar, + is_scalar, is_iterator, is_dict_like) from pandas.core.algorithms import factorize, take_1d, unique1d, take @@ -2473,7 +2472,7 @@ def _convert_to_list_like(list_like): if isinstance(list_like, list): return list_like if (is_sequence(list_like) or isinstance(list_like, tuple) or - isinstance(list_like, types.GeneratorType)): + is_iterator(list_like)): return list(list_like) elif is_scalar(list_like): return [list_like] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a2e237c8cc45d..4dacec6a93c68 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -436,12 +436,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif data is None or is_scalar(data): cls._scalar_data_error(data) else: - if tupleize_cols and is_list_like(data) and data: + if tupleize_cols and is_list_like(data): + # GH21470: convert iterable to list before 
determining if empty if is_iterator(data): data = list(data) - # we must be all tuples, otherwise don't construct - # 10697 - if all(isinstance(e, tuple) for e in data): + + if data and all(isinstance(e, tuple) for e in data): + # we must be all tuples, otherwise don't construct + # 10697 from .multi import MultiIndex return MultiIndex.from_tuples( data, names=name or kwargs.get('names')) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 19acfb294762c..a0d6907055a2e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -419,21 +419,24 @@ def test_constructor_dtypes_timedelta(self, attr, klass): result = klass(list(values), dtype=dtype) tm.assert_index_equal(result, index) - def test_constructor_empty_gen(self): - skip_index_keys = ["repeats", "periodIndex", "rangeIndex", - "tuples"] - for key, index in self.generate_index_types(skip_index_keys): - empty = index.__class__([]) - assert isinstance(empty, index.__class__) - assert not len(empty) + @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])]) + @pytest.mark.parametrize("klass", + [Index, Float64Index, Int64Index, UInt64Index, + CategoricalIndex, DatetimeIndex, TimedeltaIndex]) + def test_constructor_empty(self, value, klass): + empty = klass(value) + assert isinstance(empty, klass) + assert not len(empty) @pytest.mark.parametrize("empty,klass", [ (PeriodIndex([], freq='B'), PeriodIndex), + (PeriodIndex(iter([]), freq='B'), PeriodIndex), + (PeriodIndex((x for x in []), freq='B'), PeriodIndex), (RangeIndex(step=1), pd.RangeIndex), (MultiIndex(levels=[[1, 2], ['blue', 'red']], labels=[[], []]), MultiIndex) ]) - def test_constructor_empty(self, empty, klass): + def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) assert not len(empty) From 2292005d0e780036939a258d09c8a6db16ecdd74 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Mon, 18 Jun 2018 23:45:25 +0100 Subject: [PATCH 074/116] BUG/REG: file-handle object handled incorrectly in to_csv (#21478) (cherry picked from commit 91451cb7dbaaf6fb3f9bdfca73fe6adc2ee68cce) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/io/common.py | 4 +++ pandas/io/formats/csvs.py | 59 ++++++++++++++++++++----------- pandas/tests/frame/test_to_csv.py | 16 +++++---- pandas/tests/series/test_io.py | 18 +++++----- pandas/tests/test_common.py | 34 +++++++++++++----- 6 files changed, 87 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 2af89c15bb8fb..e3205aecee121 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - .. 
_whatsnew_0232.performance: diff --git a/pandas/io/common.py b/pandas/io/common.py index a492b7c0b8e8e..ac9077f2db50e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) + @property + def closed(self): + return self.fp is None + class MMapWrapper(BaseIterator): """ diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 7f660e2644fa4..60518f596e9af 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,11 +5,13 @@ from __future__ import print_function +import warnings + import csv as csvlib +from zipfile import ZipFile import numpy as np from pandas.core.dtypes.missing import notna -from pandas.core.dtypes.inference import is_file_like from pandas.core.index import Index, MultiIndex from pandas import compat from pandas.compat import (StringIO, range, zip) @@ -128,19 +130,31 @@ def save(self): else: encoding = self.encoding - # PR 21300 uses string buffer to receive csv writing and dump into - # file-like output with compression as option. GH 21241, 21118 - f = StringIO() - if not is_file_like(self.path_or_buf): - # path_or_buf is path - path_or_buf = self.path_or_buf - elif hasattr(self.path_or_buf, 'name'): - # path_or_buf is file handle - path_or_buf = self.path_or_buf.name - else: - # path_or_buf is file-like IO objects. + # GH 21227 internal compression is not used when file-like passed. + if self.compression and hasattr(self.path_or_buf, 'write'): + msg = ("compression has no effect when passing file-like " + "object as input.") + warnings.warn(msg, RuntimeWarning, stacklevel=2) + + # when zip compression is called. + is_zip = isinstance(self.path_or_buf, ZipFile) or ( + not hasattr(self.path_or_buf, 'write') + and self.compression == 'zip') + + if is_zip: + # zipfile doesn't support writing string to archive. uses string + # buffer to receive csv writing and dump into zip compression + # file handle. GH 21241, 21118 + f = StringIO() + close = False + elif hasattr(self.path_or_buf, 'write'): f = self.path_or_buf - path_or_buf = None + close = False + else: + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + close = True try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -157,13 +171,18 @@ def save(self): self._save() finally: - # GH 17778 handles zip compression for byte strings separately. - buf = f.getvalue() - if path_or_buf: - f, handles = _get_handle(path_or_buf, self.mode, - encoding=encoding, - compression=self.compression) - f.write(buf) + if is_zip: + # GH 17778 handles zip compression separately. 
+ buf = f.getvalue() + if hasattr(self.path_or_buf, 'write'): + self.path_or_buf.write(buf) + else: + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + f.write(buf) + close = True + if close: f.close() for _fh in handles: _fh.close() diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 60dc336a85388..3ad25ae73109e 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -9,6 +9,7 @@ import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) +from pandas.io.common import _get_handle import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: df.to_csv(filename, compression=compression, encoding=encoding) - # test the round trip - to_csv -> read_csv result = read_csv(filename, compression=compression, index_col=0, encoding=encoding) + assert_frame_equal(df, result) - with open(filename, 'w') as fh: - df.to_csv(fh, compression=compression, encoding=encoding) - - result_fh = read_csv(filename, compression=compression, - index_col=0, encoding=encoding) + # test the round trip using file handle - to_csv -> read_csv + f, _handles = _get_handle(filename, 'w', compression=compression, + encoding=encoding) + with f: + df.to_csv(f, encoding=encoding) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) assert_frame_equal(df, result) - assert_frame_equal(df, result_fh) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 76dd4bc1f3d4a..90f37053ce17e 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -11,6 +11,7 @@ from pandas import Series, DataFrame from pandas.compat import StringIO, u +from pandas.io.common import _get_handle from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -152,20 +153,19 @@ def test_to_csv_compression(self, s, encoding, compression): s.to_csv(filename, compression=compression, encoding=encoding, header=True) - # test the round trip - to_csv -> read_csv result = pd.read_csv(filename, compression=compression, encoding=encoding, index_col=0, squeeze=True) + assert_series_equal(s, result) - with open(filename, 'w') as fh: - s.to_csv(fh, compression=compression, encoding=encoding, - header=True) - - result_fh = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, - squeeze=True) + # test the round trip using file handle - to_csv -> read_csv + f, _handles = _get_handle(filename, 'w', compression=compression, + encoding=encoding) + with f: + s.to_csv(f, encoding=encoding, header=True) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) assert_series_equal(s, result) - assert_series_equal(s, result_fh) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 3443331e3d4ba..576239e49455e 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -11,6 +11,7 @@ from pandas.compat import range, lmap import pandas.core.common as com from 
pandas.core import ops +from pandas.io.common import _get_handle import pandas.util.testing as tm @@ -248,19 +249,34 @@ def test_compression_size(obj, method, compression): [12.32112, 123123.2, 321321.2]], columns=['X', 'Y', 'Z']), Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_csv']) +@pytest.mark.parametrize('method', ['to_csv', 'to_json']) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as filename: - with open(filename, 'w') as fh: - getattr(obj, method)(fh, compression=compression_only) - assert not fh.closed - assert fh.closed + f, _handles = _get_handle(filename, 'w', compression=compression_only) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed compressed = os.path.getsize(filename) with tm.ensure_clean() as filename: - with open(filename, 'w') as fh: - getattr(obj, method)(fh, compression=None) - assert not fh.closed - assert fh.closed + f, _handles = _get_handle(filename, 'w', compression=None) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed uncompressed = os.path.getsize(filename) assert uncompressed > compressed + + +# GH 21227 +def test_compression_warning(compression_only): + df = DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']) + with tm.ensure_clean() as filename: + f, _handles = _get_handle(filename, 'w', compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, + check_stacklevel=False): + with f: + df.to_csv(f, compression=compression_only) From 030a0589cdc8479c65223669b5bbf0d10a95f31c Mon Sep 17 00:00:00 2001 From: Jacopo Rota Date: Tue, 19 Jun 2018 13:26:48 +0200 Subject: [PATCH 075/116] BUG: Handle read_csv corner case (#21176) Closes gh-21141 (cherry picked from commit c2da06c8eea4cc0339717aa09acdd6765bc3d673) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/io/parsers.py | 12 +++++++++++- pandas/tests/io/parser/common.py | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index e3205aecee121..f7c04ba9cfa9f 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -61,6 +61,7 @@ Bug Fixes **I/O** +- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) - - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2c8f98732c92f..65df2bffb4abf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3209,12 +3209,22 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): col = columns[k] if is_integer(k) else k dtype[col] = v - if index_col is None or index_col is False: + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic empty Index.
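For illustration, a minimal sketch of the user-facing behavior this fix restores, mirroring the regression test added below (the ``low_memory`` option is specific to the C parser):

    import pandas as pd
    from pandas.compat import StringIO

    # Hypothetical file contents: three named columns plus an implicit
    # index column. With the fix, nrows=0 together with index_col=0
    # returns an empty frame instead of raising, even with low_memory=True.
    data = 'A,B,C\n1,1,1,2\n2,2,3,4\n3,3,4,5\n'
    out = pd.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
    assert list(out.columns) == ['A', 'B', 'C']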
+ if (index_col is None or index_col is False) or index_names is None: index = Index([]) else: data = [Series([], dtype=dtype[name]) for name in index_names] index = _ensure_index_from_sequences(data, names=index_names) index_col.sort() + for i, n in enumerate(index_col): columns.pop(n - i) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 2b7ff1f5a9879..b39122e5e7906 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -238,6 +238,21 @@ def test_csv_mixed_type(self): out = self.read_csv(StringIO(data)) tm.assert_frame_equal(out, expected) + def test_read_csv_low_memory_no_rows_with_index(self): + if self.engine == "c" and not self.low_memory: + pytest.skip("This is a low-memory specific test") + + # see gh-21141 + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + out = self.read_csv(StringIO(data), low_memory=True, + index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(out, expected) + def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, From d44fddb12ff0ff3991dfaa81b52d8f63b0f3d308 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Wed, 20 Jun 2018 16:03:07 +0530 Subject: [PATCH 076/116] REGR: Fixes first_valid_index when DataFrame or Series has duplicate row index (GH21441) (#21497) (cherry picked from commit ec2020735d72ff73e0a6a607689281aad173c702) --- doc/source/whatsnew/v0.23.2.txt | 3 ++- pandas/core/generic.py | 23 +++++++++++------------ pandas/tests/frame/test_timeseries.py | 15 ++++++++++++++- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index f7c04ba9cfa9f..7d870fefba651 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -17,7 +17,8 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) -- +- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- .. 
_whatsnew_0232.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9e4eda1bc4dc7..b03e598dcc52c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8969,18 +8969,17 @@ def _find_valid_index(self, how): is_valid = is_valid.any(1) # reduce axis 1 if how == 'first': - # First valid value case - i = is_valid.idxmax() - if not is_valid[i]: - return None - return i - - elif how == 'last': - # Last valid value case - i = is_valid.values[::-1].argmax() - if not is_valid.iat[len(self) - i - 1]: - return None - return self.index[len(self) - i - 1] + idxpos = is_valid.values[::].argmax() + + if how == 'last': + idxpos = len(self) - 1 - is_valid.values[::-1].argmax() + + chk_notna = is_valid.iat[idxpos] + idx = self.index[idxpos] + + if not chk_notna: + return None + return idx @Appender(_shared_docs['valid_index'] % {'position': 'first', 'klass': 'NDFrame'}) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 90fbc6e628369..fb9bd74d9876d 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -506,7 +506,15 @@ def test_asfreq_fillvalue(self): actual_series = ts.asfreq(freq='1S', fill_value=9.0) assert_series_equal(expected_series, actual_series) - def test_first_last_valid(self): + @pytest.mark.parametrize("data,idx,expected_first,expected_last", [ + ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2), + ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2), + ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'), + ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)]) + def test_first_last_valid(self, data, idx, + expected_first, expected_last): N = len(self.frame.index) mat = randn(N) mat[:5] = nan @@ -539,6 +547,11 @@ def test_first_last_valid(self): assert frame.first_valid_index().freq == frame.index.freq assert frame.last_valid_index().freq == frame.index.freq + # GH 21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + def test_first_subset(self): ts = tm.makeTimeDataFrame(freq='12h') result = ts.first('10d') From 172c5159ba7c1a1c0a398af4ee2ac77f00c1ef85 Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Thu, 21 Jun 2018 05:54:23 +0300 Subject: [PATCH 077/116] BUG: Fix group index calculation to prevent hitting maximum recursion depth (#21541) (cherry picked from commit f91a7049d1730aa1924584a07a1265d9f57a2f35) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/sorting.py | 29 ++++++++++++++++------------ pandas/tests/frame/test_analytics.py | 17 ++++++++++++++++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 7d870fefba651..a1b71ba5cbc43 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -58,6 +58,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). 
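For illustration, a rough sketch of the failure mode described in the bullet above, adapted from the regression test added below (the values themselves don't matter):

    import numpy as np
    import pandas as pd

    # Roughly 100 distinct columns, transposed into a very wide frame,
    # used to exhaust Python's recursion limit inside the group-index
    # calculation that backs duplicated().
    data = {'col_{:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    result = pd.DataFrame(data).T.duplicated()
    assert isinstance(result, pd.Series)  # boolean mask, no RecursionError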
- **I/O** diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e550976d1deeb..212f44e55c489 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -52,7 +52,21 @@ def _int64_cut_off(shape): return i return len(shape) - def loop(labels, shape): + def maybe_lift(lab, size): + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(_ensure_int64, labels) + if not xnull: + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + labels = list(labels) + shape = list(shape) + + # Iteratively process all the labels in chunks sized so less + # than _INT64_MAX unique int ids will be required for each chunk + while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) @@ -74,7 +88,7 @@ def loop(labels, shape): out[mask] = -1 if nlev == len(shape): # all levels done! - return out + break # compress what has been done so far in order to avoid overflow # to retain lexical ranks, obs_ids should be sorted @@ -83,16 +97,7 @@ def loop(labels, shape): labels = [comp_ids] + labels[nlev:] shape = [len(obs_ids)] + shape[nlev:] - return loop(labels, shape) - - def maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - - labels = map(_ensure_int64, labels) - if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) - - return loop(list(labels), list(shape)) + return out def get_compressed_ids(labels, sizes): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 90d7c46f7554f..4197339ff6e03 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1507,6 +1507,23 @@ def test_duplicated_with_misspelled_column_name(self, subset): with pytest.raises(KeyError): df.drop_duplicates(subset) + @pytest.mark.slow + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool pd.Series as a result + # and don't fail during calculation. 
+ # Actual values don't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ From a2199d2c01241d325bbff9474a94c47a8a7a4b82 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 21 Jun 2018 09:13:01 +0100 Subject: [PATCH 078/116] BUG: Fix passing empty label to df drop (#21515) Closes #21494 (cherry picked from commit f4fba9e90f6a7e27af984acc77403139ef600d8f) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/generic.py | 21 ++-- pandas/core/indexes/base.py | 4 +- pandas/core/indexes/multi.py | 1 - .../tests/frame/test_axis_select_reindex.py | 15 +++ .../tests/series/indexing/test_alter_index.py | 106 ++++++++++++------ 6 files changed, 98 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index a1b71ba5cbc43..20d427335a47f 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -58,6 +58,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in :meth:`DataFrame.drop` where behaviour was not consistent for unique and non-unique indexes (:issue:`21494`) - Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). - diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b03e598dcc52c..612ee7cb42021 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3129,7 +3129,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): """ axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) - axis, axis_ = self._get_axis(axis), axis + axis = self._get_axis(axis) if axis.is_unique: if level is not None: @@ -3138,24 +3138,25 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): new_axis = axis.drop(labels, level=level, errors=errors) else: new_axis = axis.drop(labels, errors=errors) - dropped = self.reindex(**{axis_name: new_axis}) - try: - dropped.axes[axis_].set_names(axis.names, inplace=True) - except AttributeError: - pass - result = dropped + result = self.reindex(**{axis_name: new_axis}) + # Case for non-unique axis else: labels = _ensure_object(com._index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') indexer = ~axis.get_level_values(level).isin(labels) + + # GH 18561 MultiIndex.drop should raise if label is absent + if errors == 'raise' and indexer.all(): + raise KeyError('{} not found in axis'.format(labels)) else: indexer = ~axis.isin(labels) - - if errors == 'raise' and indexer.all(): - raise KeyError('{} not found in axis'.format(labels)) + # Check if label doesn't exist along axis + labels_missing = (axis.get_indexer_for(labels) == -1).any() + if errors == 'raise' and labels_missing: + raise KeyError('{} not found in axis'.format(labels)) slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4dacec6a93c68..59527afe6c1f7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4392,7 +4392,7 @@ def drop(self, labels, errors='raise'): Raises ------ KeyError - If none of
the labels are found in the selected axis + If not all of the labels are found in the selected axis """ arr_dtype = 'object' if self.dtype == 'object' else None labels = com._index_labels_to_array(labels, dtype=arr_dtype) @@ -4401,7 +4401,7 @@ def drop(self, labels, errors='raise'): if mask.any(): if errors != 'ignore': raise KeyError( - 'labels %s not contained in axis' % labels[mask]) + '{} not found in axis'.format(labels[mask])) indexer = indexer[~mask] return self.delete(indexer) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c8332d762f7ef..80bf73cfe7dd3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1707,7 +1707,6 @@ def drop(self, labels, level=None, errors='raise'): if errors != 'ignore': raise ValueError('labels %s not contained in axis' % labels[mask]) - indexer = indexer[~mask] except Exception: pass diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 28e82f7585850..0e0d6598f5101 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -1151,3 +1151,18 @@ def test_raise_on_drop_duplicate_index(self, actual): expected_no_err = actual.T.drop('c', axis=1, level=level, errors='ignore') assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + frame = pd.DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH 21494 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.DataFrame(index=index).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 999ed5f26daee..2fdf198596ce2 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -463,54 +463,86 @@ def test_rename(): assert result.name == expected.name -def test_drop(): - # unique - s = Series([1, 2], index=['one', 'two']) - expected = Series([1], index=['one']) - result = s.drop(['two']) - assert_series_equal(result, expected) - result = s.drop('two', axis='rows') - assert_series_equal(result, expected) - - # non-unique - # GH 5248 - s = Series([1, 1, 2], index=['one', 'two', 'one']) - expected = Series([1, 2], index=['one', 'one']) - result = s.drop(['two'], axis=0) - assert_series_equal(result, expected) - result = s.drop('two') - assert_series_equal(result, expected) - - expected = Series([1], index=['two']) - result = s.drop(['one']) - assert_series_equal(result, expected) - result = s.drop('one') - assert_series_equal(result, expected) +@pytest.mark.parametrize( + 'data, index, drop_labels,' + ' axis, expected_data, expected_index', + [ + # Unique Index + ([1, 2], ['one', 'two'], ['two'], + 0, [1], ['one']), + ([1, 2], ['one', 'two'], ['two'], + 'rows', [1], ['one']), + ([1, 1, 2], ['one', 'two', 'one'], ['two'], + 0, [1, 2], ['one', 'one']), + + # GH 5248 Non-Unique Index + ([1, 1, 2], ['one', 'two', 'one'], 'two', + 0, [1, 2], ['one', 'one']), + ([1, 1, 2], ['one', 'two', 'one'], ['one'], + 0, [1], ['two']), + ([1, 1, 
2], ['one', 'two', 'one'], 'one', + 0, [1], ['two'])]) +def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, + expected_data, expected_index): + + s = Series(data=data, index=index) + result = s.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) - # single string/tuple-like - s = Series(range(3), index=list('abc')) - pytest.raises(KeyError, s.drop, 'bc') - pytest.raises(KeyError, s.drop, ('a',)) +@pytest.mark.parametrize( + 'data, index, drop_labels,' + ' axis, error_type, error_desc', + [ + # single string/tuple-like + (range(3), list('abc'), 'bc', + 0, KeyError, 'not found in axis'), + + # bad axis + (range(3), list('abc'), ('a',), + 0, KeyError, 'not found in axis'), + (range(3), list('abc'), 'one', + 'columns', ValueError, 'No axis named columns')]) +def test_drop_exception_raised(data, index, drop_labels, + axis, error_type, error_desc): + + with tm.assert_raises_regex(error_type, error_desc): + Series(data, index=index).drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): # errors='ignore' s = Series(range(3), index=list('abc')) result = s.drop('bc', errors='ignore') - assert_series_equal(result, s) + tm.assert_series_equal(result, s) result = s.drop(['a', 'd'], errors='ignore') expected = s.iloc[1:] - assert_series_equal(result, expected) - - # bad axis - pytest.raises(ValueError, s.drop, 'one', axis='columns') + tm.assert_series_equal(result, expected) # GH 8522 s = Series([2, 3], index=[True, False]) assert s.index.is_object() result = s.drop(True) expected = Series([3], index=[False]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + - # GH 16877 - s = Series([2, 3], index=[0, 1]) - with tm.assert_raises_regex(KeyError, 'not contained in axis'): - s.drop([False, True]) +@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize('drop_labels', [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = pd.Series(index=index).drop(drop_labels) + tm.assert_series_equal(series, pd.Series(index=expected_index)) + + +@pytest.mark.parametrize('data, index, drop_labels', [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]) +]) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.Series(data=data, index=index).drop(drop_labels) From 4b1a68776aa20bb2dc081bb77093adb6c47957f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Jun 2018 03:18:53 -0700 Subject: [PATCH 079/116] fix hashing string-casting error (#21187) (cherry picked from commit e24da6c9f92d2b04ffb39a7fe0db85015af7ff3f) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/_libs/hashing.pyx | 7 ++----- pandas/tests/series/test_repr.py | 30 ++++++++++++++++++++++++++++++ pandas/util/testing.py | 22 ++++++++++++++++++++++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 20d427335a47f..60376f416aeb7 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -80,6 +80,7 @@ Bug Fixes **Categorical** +- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) - **Timezones** diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 
c6f182ac5003f..4489847518a1d 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -8,8 +8,7 @@ import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t from util cimport _checknull -from cpython cimport (PyString_Check, - PyBytes_Check, +from cpython cimport (PyBytes_Check, PyUnicode_Check) from libc.stdlib cimport malloc, free @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): cdef list datas = [] for i in range(n): val = arr[i] - if PyString_Check(val): - data = val.encode(encoding) - elif PyBytes_Check(val): + if PyBytes_Check(val): data = val elif PyUnicode_Check(val): data = val.encode(encoding) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 97236f028b1c4..730c2b7865f1f 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -11,6 +11,7 @@ from pandas import (Index, Series, DataFrame, date_range, option_context, Categorical, period_range, timedelta_range) from pandas.core.index import MultiIndex +from pandas.core.base import StringMixin from pandas.compat import lrange, range, u from pandas import compat @@ -202,6 +203,35 @@ def test_latex_repr(self): class TestCategoricalRepr(object): + def test_categorical_repr_unicode(self): + # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', + # and we are working in PY2, then rendering a Categorical could raise + # UnicodeDecodeError by trying to decode when it shouldn't + + class County(StringMixin): + name = u'San Sebastián' + state = u'PR' + + def __unicode__(self): + return self.name + u', ' + self.state + + cat = pd.Categorical([County() for n in range(61)]) + idx = pd.Index(cat) + ser = idx.to_series() + + if compat.PY3: + # no reloading of sys, just check that the default (utf8) works + # as expected + repr(ser) + str(ser) + + else: + # set sys.defaultencoding to ascii, then change it back after + # the test + with tm.set_defaultencoding('ascii'): + repr(ser) + str(ser) + def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 233eba6490937..6384eca9849f6 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -553,6 +553,28 @@ def _valid_locales(locales, normalize): # Stdout / stderr decorators +@contextmanager +def set_defaultencoding(encoding): + """ + Set default encoding (as given by sys.getdefaultencoding()) to the given + encoding; restore on exit. 
+ + Parameters + ---------- + encoding : str + """ + if not PY2: + raise ValueError("set_defaultencoding context is only available " + "in Python 2.") + orig = sys.getdefaultencoding() + reload(sys) # noqa:F821 + sys.setdefaultencoding(encoding) + try: + yield + finally: + sys.setdefaultencoding(orig) + + def capture_stdout(f): """ Decorator to capture stdout in a buffer so that it can be checked From 2d2f6aa9e368e3d97d8a8d24a802357e4ac3a919 Mon Sep 17 00:00:00 2001 From: Jacopo Rota Date: Sat, 23 Jun 2018 01:04:38 +0200 Subject: [PATCH 080/116] add test case when to_csv argument is sys.stdout (#21572) (cherry picked from commit 66fea91e915ca5e3f096055f3ad0f07335483e3f) --- pandas/tests/io/formats/test_to_csv.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index dfa3751bff57a..36c4ae547ad4e 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -285,3 +285,18 @@ def test_to_csv_string_array_utf8(self): df.to_csv(path, encoding='utf-8') with open(path, 'r') as f: assert f.read() == expected_utf8 + + @tm.capture_stdout + def test_to_csv_stdout_file(self): + # GH 21561 + df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']], + columns=['name_1', 'name_2']) + expected_ascii = '''\ +,name_1,name_2 +0,foo,bar +1,baz,qux +''' + df.to_csv(sys.stdout, encoding='ascii') + output = sys.stdout.getvalue() + assert output == expected_ascii + assert not sys.stdout.closed From cf0a55f86eb73782d0d76cc9208ca56d374c9a5e Mon Sep 17 00:00:00 2001 From: Vu Le Date: Sat, 23 Jun 2018 06:07:21 +0700 Subject: [PATCH 081/116] BUG: Fix json_normalize throwing TypeError (#21536) (#21540) (cherry picked from commit 5fdaa9717f7550c5293d421205bfa19011278396) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/io/json/normalize.py | 8 +++++++- pandas/tests/io/json/test_normalize.py | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 60376f416aeb7..53ca4c0d1c144 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -65,7 +65,7 @@ Bug Fixes **I/O** - Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) -- +- Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`) - **Plotting** diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index b845a43b9ca9e..2004a24c2ec5a 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -170,6 +170,11 @@ def json_normalize(data, record_path=None, meta=None, 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH + >>> data = {'A': [1, 2]} + >>> json_normalize(data, 'A', record_prefix='Prefix.') + Prefix.0 + 0 1 + 1 2 """ def _pull_field(js, spec): result = js @@ -259,7 +264,8 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result.rename(columns=lambda x: record_prefix + x, inplace=True) + result = result.rename( + columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 395c2c90767d3..200a853c48900 100644 --- a/pandas/tests/io/json/test_normalize.py +++ 
b/pandas/tests/io/json/test_normalize.py @@ -123,6 +123,12 @@ def test_simple_normalize_with_separator(self, deep_nested): 'country', 'states_name']).sort_values() assert result.columns.sort_values().equals(expected) + def test_value_array_record_prefix(self): + # GH 21536 + result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.') + expected = DataFrame([[1], [2]], columns=['Prefix.0']) + tm.assert_frame_equal(result, expected) + def test_more_deeply_nested(self, deep_nested): result = json_normalize(deep_nested, ['states', 'cities'], From 176695fde32e872478d303ab21965bd49416aae4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Jul 2018 13:48:59 +0200 Subject: [PATCH 082/116] Remove incorrectly added TestMode class The tests were incorrectly added from https://github.com/pandas-dev/pandas/commit/f1631bec96dd9a1dc4890677b9c5475d0677e102#diff-dc347bc3d0448ea297bed67dc7ff3437 when fixing merge conflicts during cherry-picking --- pandas/tests/series/test_analytics.py | 174 -------------------------- 1 file changed, 174 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 7a78b562ac1fa..1e6ea96a5de51 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1866,180 +1866,6 @@ def s_main_dtypes(): return df -class TestMode(object): - - @pytest.mark.parametrize('dropna, expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) - def test_mode_empty(self, dropna, expected): - s = Series([], dtype=np.float64) - result = s.mode(dropna) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) - @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) - ) - def test_mode_numerical(self, dropna, data, expected, dt): - s = Series(data, dtype=dt) - result = s.mode(dropna) - expected = Series(expected, dtype=dt) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) - def test_mode_numerical_nan(self, dropna, expected): - s = Series([1, 1, 2, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): - # Test string and object types. 
- data = ['a'] * 2 + ['b'] * 3 - - s = Series(data, dtype='c') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') - tm.assert_series_equal(result, expected1) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected2) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object).astype(str) - result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) - def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) - result = s.mode(dropna) - expected = Series(expected1) - tm.assert_series_equal(result, expected) - - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) - def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) - def test_mode_timedelta(self, dropna, expected1, expected2): - # gh-5986: Test timedelta types. 
- - s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([3, 1], categories=[3, 2, 1], ordered=True)), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), - ]) - def test_mode_category(self, dropna, expected1, expected2, expected3): - s = Series(Categorical([1, 2, np.nan, np.nan])) - result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') - tm.assert_series_equal(result, expected1) - - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) - result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') - tm.assert_series_equal(result, expected2) - - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], - categories=[3, 2, 1], ordered=True)) - result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - (False, [2**63], [1, 2**63]) - ]) - def test_mode_intoverflow(self, dropna, expected1, expected2): - # Test for uint64 overflow. - s = Series([1, 2**63, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected1 = Series(expected1, dtype=np.uint64) - tm.assert_series_equal(result, expected1) - - s = Series([1, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=np.uint64) - tm.assert_series_equal(result, expected2) - - @pytest.mark.skipif(not compat.PY3, reason="only PY3") - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - - tm.assert_series_equal(result, expected) - - def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests s = Series(vals, dtype=dtype) From 8c7996d2211a95cf67ff2d465dd3c1517b90a310 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Jul 2018 17:25:33 +0200 Subject: [PATCH 083/116] DOC: fix spaces in 0.23.1 whatsnew file Take from https://github.com/pandas-dev/pandas/commit/e92b78603e1404e49d6bcb19873d2d24225a8e50 (could not be cherry-picked in its totality) --- doc/source/whatsnew/v0.23.1.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db25bcf8113f5..a52ba22cf36d2 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -97,8 +97,8 @@ Bug Fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) -- Bug in :class:`Timedelta`: where passing a float with a unit would 
prematurely round the float precision (:issue: `14156`) +- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue:`21078`) +- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) **Sparse** @@ -110,12 +110,12 @@ Bug Fixes - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) -- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) +- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) **Plotting** -- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue: `20968`) +- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue:`20968`) **I/O** From d0f664a20d581919b6d5d6efef9704e540b013b8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Jun 2018 09:57:43 -0500 Subject: [PATCH 084/116] CI: Test against Python 3.7 (#21604) (cherry picked from commit 7829ad3290dc6894d24c1c853ffc4dabef50294a) --- .travis.yml | 5 +++++ ci/travis-37.yaml | 14 ++++++++++++++ doc/source/install.rst | 2 +- doc/source/whatsnew/v0.23.2.txt | 6 ++++++ pandas/compat/__init__.py | 9 +++++---- pandas/tests/tseries/offsets/test_offsets.py | 10 ++++++++-- setup.py | 1 + 7 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 ci/travis-37.yaml diff --git a/.travis.yml b/.travis.yml index 4e25380a7d941..2d2a0bc019c80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,11 @@ matrix: language: generic env: - JOB="3.5, OSX" ENV_FILE="ci/travis-35-osx.yaml" TEST_ARGS="--skip-slow --skip-network" + + - dist: trusty + env: + - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - dist: trusty env: - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml new file mode 100644 index 0000000000000..8b255c9e6ec72 --- /dev/null +++ b/ci/travis-37.yaml @@ -0,0 +1,14 @@ +name: pandas +channels: + - defaults + - conda-forge + - c3i_test +dependencies: + - python=3.7 + - cython + - numpy + - python-dateutil + - nomkl + - pytz + - pytest + - pytest-xdist diff --git a/doc/source/install.rst b/doc/source/install.rst index 6054be112f52c..846170f9f0fa5 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -43,7 +43,7 @@ For more information, see the `Python 3 statement`_ and the `Porting to Python 3 Python version support ---------------------- 
-Officially Python 2.7, 3.5, and 3.6. +Officially Python 2.7, 3.5, 3.6, and 3.7. Installing pandas ----------------- diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 53ca4c0d1c144..5d196c4fe8d15 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -6,6 +6,12 @@ v0.23.2 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. +.. note:: + + Pandas 0.23.2 is the first pandas release that's compatible with + Python 3.7 (:issue:`20552`) + + .. contents:: What's new in v0.23.2 :local: :backlinks: none diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5ae22694d0da7..28a55133e68aa 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -40,10 +40,11 @@ from collections import namedtuple PY2 = sys.version_info[0] == 2 -PY3 = (sys.version_info[0] >= 3) -PY35 = (sys.version_info >= (3, 5)) -PY36 = (sys.version_info >= (3, 6)) -PYPY = (platform.python_implementation() == 'PyPy') +PY3 = sys.version_info[0] >= 3 +PY35 = sys.version_info >= (3, 5) +PY36 = sys.version_info >= (3, 6) +PY37 = sys.version_info >= (3, 7) +PYPY = platform.python_implementation() == 'PyPy' try: import __builtin__ as builtins diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 5369b1a94a956..0c08d813a7f1b 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -528,7 +528,10 @@ def test_repr(self): assert repr(self.offset) == '<BusinessDay>' assert repr(self.offset2) == '<2 * BusinessDays>' - expected = '<BusinessDay: offset=datetime.timedelta(1)>' + if compat.PY37: + expected = '<BusinessDay: offset=datetime.timedelta(days=1)>' + else: + expected = '<BusinessDay: offset=datetime.timedelta(1)>' assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): @@ -1642,7 +1645,10 @@ def test_repr(self): assert repr(self.offset) == '<CustomBusinessDay>' assert repr(self.offset2) == '<2 * CustomBusinessDays>' - expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>' + if compat.PY37: + expected = '<CustomBusinessDay: offset=datetime.timedelta(days=1)>' + else: + expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>' assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): diff --git a/setup.py b/setup.py index 90ec8e91a0700..c5831eb097767 100755 --- a/setup.py +++ b/setup.py @@ -217,6 +217,7 @@ def build_extensions(self): 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Programming Language :: Cython', 'Topic :: Scientific/Engineering'] From 684a4bda53cd37d7972162cfe5a582966cc1b070 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Jun 2018 02:34:15 -0500 Subject: [PATCH 085/116] API/COMPAT: support axis=None for logical reduction (reduce over all axes) (#21486) * Compat with NumPy 1.15 logical func * Accepts axis=None as reduce all dims (cherry picked from commit f7ed7f8e30e7418b346831c73e2f4541b7ae11be) --- doc/source/whatsnew/v0.23.2.txt | 30 +++++++ pandas/core/frame.py | 22 ++++- pandas/core/generic.py | 44 ++++++---- pandas/core/panel.py | 17 +++- pandas/core/series.py | 3 +- pandas/tests/frame/test_analytics.py | 119 +++++++++++++++++++++++++-- pandas/tests/test_panel.py | 7 ++ pandas/util/_test_decorators.py | 4 + 8 files changed, 215 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 5d196c4fe8d15..f5a520216b2be 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -16,6 +16,36 @@ and bug fixes.
We recommend that all users upgrade to this version. :local: :backlinks: none +.. _whatsnew_0232.enhancements: + +Logical Reductions over Entire DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2], "B": [True, False]}) + df.all(axis=None) + + +This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``. +With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` will no longer reduce over every axis: + +.. code-block:: python + + >>> # NumPy 1.15, pandas 0.23.1 + >>> np.any(pd.DataFrame({"A": [False], "B": [False]})) + A False + B False + dtype: bool + +With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15. + +.. ipython:: python + + np.any(pd.DataFrame({"A": [False], "B": [False]})) + .. _whatsnew_0232.fixed_regressions: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9f6e834f0a25f..2a40dd28a6fd7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6844,13 +6844,18 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): - axis = self._get_axis_number(axis) + if axis is None and filter_type == 'bool': + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - labels = self._get_agg_axis(axis) - # exclude timedelta/datetime unless we are uniform types if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: numeric_only = True @@ -6859,6 +6864,13 @@ def f(x): try: values = self.values result = f(values) + + if (filter_type == 'bool' and is_object_dtype(values) and + axis is None): + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) except Exception as e: # try by-column first @@ -6925,7 +6937,9 @@ def f(x): if axis == 0: result = coerce_to_dtypes(result, self.dtypes) - return Series(result, index=labels) + if constructor is not None: + result = Series(result, index=labels) + return result def nunique(self, axis=0, dropna=True): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 612ee7cb42021..50a5c10a6865f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8729,6 +8729,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + if axis is None: + raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) @@ -9055,8 +9057,15 @@ def _doc_parms(cls): Parameters ---------- -axis : int, default 0 - Select the axis which can be 0 for indices and 1 for columns. +axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. 
+ * None : reduce all axes, return a scalar. + skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -9078,9 +9087,9 @@ def _doc_parms(cls): %(examples)s""" _all_doc = """\ -Return whether all elements are True over series or dataframe axis. +Return whether all elements are True, potentially over an axis. -Returns True if all elements within a series or along a dataframe +Returns True if all elements within a series or along a Dataframe axis are non-zero, not-empty or not-False.""" _all_examples = """\ @@ -9093,7 +9102,7 @@ def _doc_parms(cls): >>> pd.Series([True, False]).all() False -Dataframes +DataFrames Create a dataframe from a dictionary. @@ -9110,12 +9119,17 @@ def _doc_parms(cls): col2 False dtype: bool -Adding axis=1 argument will check if row-wise values all return True. +Specify ``axis='columns'`` to check if row-wise values all return True. ->>> df.all(axis=1) +>>> df.all(axis='columns') 0 True 1 False dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False """ _all_see_also = """\ @@ -9481,6 +9495,11 @@ def _doc_parms(cls): 1 False dtype: bool +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + `any` for an empty DataFrame is an empty Series. >>> pd.DataFrame([]).any() @@ -9651,22 +9670,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) - def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number if level is not None: if bool_only is not None: raise NotImplementedError("Option bool_only is not " "implemented with option level.") return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce(f, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool', - name=name) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') return set_function_name(logical_func, name, cls) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 16e64192fdb20..bad0dd79aaedd 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1143,13 +1143,26 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, raise NotImplementedError('Panel.{0} does not implement ' 'numeric_only.'.format(name)) - axis_name = self._get_axis_name(axis) - axis_number = self._get_axis_number(axis_name) + if axis is None and filter_type == 'bool': + # labels = None + # constructor = None + axis_number = None + axis_name = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + # labels = self._get_agg_axis(axis) + # constructor = self._constructor + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) with np.errstate(all='ignore'): result = f(self.values) + if axis is None and filter_type == 'bool': + return np.bool_(result) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: result = result.T diff --git a/pandas/core/series.py b/pandas/core/series.py index 
6975dd8fc918e..6b005c673c7cd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3212,7 +3212,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, delegate = self._values if isinstance(delegate, np.ndarray): # Validate that 'axis' is consistent with Series's single axis. - self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4197339ff6e03..437d3a9d24730 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -15,7 +15,7 @@ from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, - _np_version_under1p12, _np_version_under1p15) + _np_version_under1p12) import pandas as pd import pandas.core.nanops as nanops import pandas.core.algorithms as algorithms @@ -1139,11 +1139,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) - df = DataFrame(randn(10, 4)) > 0 - df.any(1) - df.all(1) - df.any(1, bool_only=True) - df.all(1, bool_only=True) + def test_any_all_extra(self): + df = DataFrame({ + 'A': [True, False, False], + 'B': [True, True, False], + 'C': [True, True, True], + }, index=['a', 'b', 'c']) + result = df[['A', 'B']].any(1) + expected = Series([True, True, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df[['A', 'B']].any(1, bool_only=True) + tm.assert_series_equal(result, expected) + + result = df.all(1) + expected = Series([True, False, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df.all(1, bool_only=True) + tm.assert_series_equal(result, expected) + + # Axis is None + result = df.all(axis=None).item() + assert result is False + + result = df.any(axis=None).item() + assert result is True + + result = df[['C']].all(axis=None).item() + assert result is True # skip pathological failure cases # class CantNonzero(object): @@ -1165,6 +1189,86 @@ def test_any_all(self): # df.any(1, bool_only=True) # df.all(1, bool_only=True) + @pytest.mark.parametrize('func, data, expected', [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {'A': []}, False), + (np.all, {'A': []}, True), + (np.any, {'A': [False, False]}, False), + (np.all, {'A': [False, False]}, False), + (np.any, {'A': [True, False]}, True), + (np.all, {'A': [True, False]}, False), + (np.any, {'A': [True, True]}, True), + (np.all, {'A': [True, True]}, True), + + (np.any, {'A': [False], 'B': [False]}, False), + (np.all, {'A': [False], 'B': [False]}, False), + + (np.any, {'A': [False, False], 'B': [False, True]}, True), + (np.all, {'A': [False, False], 'B': [False, True]}, False), + + # other types + (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), + (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), + (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), + (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + 
pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), + (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), + (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), + (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), + + # # Mix + # GH-21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ]) + def test_any_all_np_func(self, func, data, expected): + # https://github.com/pandas-dev/pandas/issues/19976 + data = DataFrame(data) + result = func(data) + assert isinstance(result, np.bool_) + assert result.item() is expected + + # method version + result = getattr(DataFrame(data), func.__name__)(axis=None) + assert isinstance(result, np.bool_) + assert result.item() is expected + + def test_any_all_object(self): + # https://github.com/pandas-dev/pandas/issues/19976 + result = np.all(DataFrame(columns=['a', 'b'])).item() + assert result is True + + result = np.any(DataFrame(columns=['a', 'b'])).item() + assert result is False + + @pytest.mark.parametrize('method', ['any', 'all']) + def test_any_all_level_axis_none_raises(self, method): + df = DataFrame( + {"A": 1}, + index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], + names=['out', 'in']) + ) + xpr = "Must specify 'axis' when aggregating by level." 
+ with tm.assert_raises_regex(ValueError, xpr): + getattr(df, method)(axis=None, level='out') + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: @@ -2071,9 +2175,6 @@ def test_clip_against_list_like(self, inplace, lower, axis, res): result = original tm.assert_frame_equal(result, expected, check_exact=True) - @pytest.mark.xfail( - not _np_version_under1p15, - reason="failing under numpy-dev gh-19976") @pytest.mark.parametrize("axis", [0, 1, None]) def test_clip_against_frame(self, axis): df = DataFrame(np.random.randn(1000, 2)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 7973b27601237..128ab0572ba55 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2717,3 +2717,10 @@ def test_panel_index(): np.repeat([1, 2, 3], 4)], names=['time', 'panel']) tm.assert_index_equal(index, expected) + + +def test_panel_np_all(): + with catch_warnings(record=True): + wp = Panel({"A": DataFrame({'b': [1, 2]})}) + result = np.all(wp) + assert result == np.bool_(True) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 89d90258f58e0..27c24e3a68079 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -30,6 +30,7 @@ def test_foo(): from pandas.compat import (is_platform_windows, is_platform_32bit, PY3, import_lzma) +from pandas.compat.numpy import _np_version_under1p15 from pandas.core.computation.expressions import (_USE_NUMEXPR, _NUMEXPR_INSTALLED) @@ -160,6 +161,9 @@ def decorated_func(func): skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), reason="Missing matplotlib dependency") + +skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15, + reason="NumPy 1.15 or greater required") skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(), From 01bb92127abd4a23005e780eb1e9b09cacfbb748 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 26 Jun 2018 08:26:21 -0400 Subject: [PATCH 086/116] TST: xfail flaky 3.7 test, xref #21636 (#21637) (cherry picked from commit dbd102c863adb36d07b999c2fc26403717c4bc32) --- pandas/tests/groupby/test_categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0793b8e1bd64..0fec6a8f96a24 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.compat import PY37 from pandas import (Index, MultiIndex, CategoricalIndex, DataFrame, Categorical, Series, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -205,6 +206,7 @@ def test_level_get_group(observed): assert_frame_equal(result, expected) +@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636") @pytest.mark.parametrize('ordered', [True, False]) def test_apply(ordered): # GH 10138 From 417e87372831c4c5f906a99e19227e1d5ab7d2b3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Jun 2018 10:02:17 -0500 Subject: [PATCH 087/116] PKG: Exclude data test files. 
(#19535) (cherry picked from commit 36422a88474396148bd7d5d38aa238ea844d9555) --- MANIFEST.in | 34 ++++--- ci/script_single.sh | 8 +- doc/source/whatsnew/v0.23.2.txt | 5 + pandas/conftest.py | 42 +++++++++ pandas/tests/indexes/test_multi.py | 8 +- pandas/tests/io/conftest.py | 21 ++--- pandas/tests/io/formats/test_format.py | 4 +- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 8 +- pandas/tests/io/parser/common.py | 25 +++-- pandas/tests/io/parser/compression.py | 4 +- pandas/tests/io/parser/dtypes.py | 6 +- pandas/tests/io/parser/test_network.py | 53 +++++------ pandas/tests/io/parser/test_parsers.py | 6 +- pandas/tests/io/parser/test_textreader.py | 5 +- pandas/tests/io/sas/test_sas7bdat.py | 43 ++++----- pandas/tests/io/sas/test_xport.py | 6 +- pandas/tests/io/test_common.py | 54 +++++------ pandas/tests/io/test_excel.py | 12 +-- pandas/tests/io/test_html.py | 92 +++++++++++-------- pandas/tests/io/test_packers.py | 51 +++++----- pandas/tests/io/test_pickle.py | 38 ++++---- pandas/tests/io/test_pytables.py | 23 +++-- pandas/tests/io/test_sql.py | 63 +++++++------ pandas/tests/io/test_stata.py | 9 +- pandas/tests/plotting/common.py | 5 - pandas/tests/plotting/test_deprecated.py | 5 +- pandas/tests/plotting/test_misc.py | 16 ++-- pandas/tests/reshape/merge/test_merge_asof.py | 33 +++---- pandas/tests/reshape/test_tile.py | 6 +- pandas/tests/tseries/offsets/test_offsets.py | 16 ++-- pandas/tests/util/test_testing.py | 13 +++ pandas/util/_test_decorators.py | 1 - pandas/util/testing.py | 10 -- setup.cfg | 3 +- setup.py | 6 +- 36 files changed, 393 insertions(+), 347 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 9773019c6e6e0..b417b8890fa24 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,27 +3,39 @@ include LICENSE include RELEASE.md include README.md include setup.py -include pyproject.toml graft doc prune doc/build +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl diff --git a/ci/script_single.sh b/ci/script_single.sh index f376c920ac71b..60e2fbb33ee5d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -25,12 +25,12 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas 
--cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index f5a520216b2be..b3da4d1c4e288 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -70,6 +70,11 @@ Documentation Changes - - +Build Changes +------------- + +- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) + .. _whatsnew_0232.bug_fixes: Bug Fixes diff --git a/pandas/conftest.py b/pandas/conftest.py index 9d806a91f37f7..ead357747666d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,5 +1,8 @@ +import os + import pytest +import pandas import numpy as np import pandas as pd from pandas.compat import PY3 @@ -15,6 +18,8 @@ def pytest_addoption(parser): help="run high memory tests") parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption("--strict-data-files", action="store_true", + help="Fail if a test is skipped for a missing data file.") def pytest_runtest_setup(item): @@ -129,6 +134,43 @@ def join_type(request): return request.param +@pytest.fixture +def datapath(request): + """Get the path to a data file. + + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + + Returns + ------- + path : str + Path to the file, rooted at ``pandas/tests``. + + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. + """ + def deco(*args): + path = os.path.join('pandas', 'tests', *args) + if not os.path.exists(path): + if request.config.getoption("--strict-data-files"): + msg = "Could not find file {} and --strict-data-files is set." + raise ValueError(msg.format(path)) + else: + msg = "Could not find {}."
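# Editorial annotation (not part of the patch): a minimal sketch of how a
# test consumes the datapath fixture defined above. 'example.csv' is a
# hypothetical file name; a missing file skips the test by default and
# only hard-fails under --strict-data-files.
import pandas as pd

def test_read_example(datapath):
    path = datapath('io', 'data', 'example.csv')  # hypothetical data file
    df = pd.read_csv(path)
    assert not df.empty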
+ pytest.skip(msg.format(path)) + return path + return deco + + +@pytest.fixture +def iris(datapath): + """The iris dataset as a DataFrame.""" + return pandas.read_csv(datapath('data', 'iris.csv')) + + @pytest.fixture(params=['nlargest', 'nsmallest']) def nselect_method(request): """ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index df506ae9486ee..3ede83b5969ce 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1181,12 +1181,12 @@ def test_iter(self): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] assert result == expected - def test_legacy_pickle(self): + def test_legacy_pickle(self, datapath): if PY3: pytest.skip("testing for legacy pickles not " "supported on py3") - path = tm.get_data_path('multiindex_v1.pickle') + path = datapath('indexes', 'data', 'multiindex_v1.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) @@ -1202,10 +1202,10 @@ def test_legacy_pickle(self): assert_almost_equal(res, exp) assert_almost_equal(exp, exp2) - def test_legacy_v2_unpickle(self): + def test_legacy_v2_unpickle(self, datapath): # 0.7.3 -> 0.8.0 format manage - path = tm.get_data_path('mindex_073.pickle') + path = datapath('indexes', 'data', 'mindex_073.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8deb51e190bab..7623587803b41 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,32 +1,23 @@ -import os - import pytest from pandas.io.parsers import read_table -from pandas.util import testing as tm - - -@pytest.fixture -def parser_data(request): - return os.path.join(tm.get_data_path(), '..', 'parser', 'data') @pytest.fixture -def tips_file(parser_data): +def tips_file(datapath): """Path to the tips dataset""" - return os.path.join(parser_data, 'tips.csv') + return datapath('io', 'parser', 'data', 'tips.csv') @pytest.fixture -def jsonl_file(parser_data): +def jsonl_file(datapath): """Path to a JSONL dataset""" - return os.path.join(parser_data, 'items.jsonl') + return datapath('io', 'parser', 'data', 'items.jsonl') @pytest.fixture -def salaries_table(parser_data): +def salaries_table(datapath): """DataFrame with the salaries dataset""" - path = os.path.join(parser_data, 'salaries.csv') - return read_table(path) + return read_table(datapath('io', 'parser', 'data', 'salaries.csv')) @pytest.fixture diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f221df93dd412..63b7cb3459069 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self): dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})}) compat.text_type(dm.to_string()) - def test_string_repr_encoding(self): - filepath = tm.get_data_path('unicode_series.csv') + def test_string_repr_encoding(self, datapath): + filepath = datapath('io', 'formats', 'data', 'unicode_series.csv') df = pd.read_csv(filepath, header=None, encoding='latin1') repr(df) repr(df[1]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c9074ca49e5be..05ceace20f5a4 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,11 +21,11 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_read_zipped_json(): - uncompressed_path =
tm.get_data_path("tsframe_v012.json") +def test_read_zipped_json(datapath): + uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) - compressed_path = tm.get_data_path("tsframe_v012.json.zip") + compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") compressed_df = pd.read_json(compressed_path, compression='zip') assert_frame_equal(uncompressed_df, compressed_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7e497c395266f..bcbac4400c953 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -37,8 +37,9 @@ class TestPandasContainer(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(scope="function", autouse=True) + def setup(self, datapath): + self.dirpath = datapath("io", "json", "data") self.ts = tm.makeTimeSeries() self.ts.name = 'ts' @@ -59,7 +60,8 @@ def setup_method(self, method): self.mixed_frame = _mixed_frame.copy() self.categorical = _cat_frame.copy() - def teardown_method(self, method): + yield + del self.dirpath del self.ts diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index b39122e5e7906..fb510f1a74556 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -77,7 +77,7 @@ def test_read_csv(self): else: prefix = u("file://") - fname = prefix + compat.text_type(self.csv1) + fname = prefix + compat.text_type(os.path.abspath(self.csv1)) self.read_csv(fname, index_col=0, parse_dates=True) def test_1000_sep(self): @@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self): tm.assert_frame_equal(df, expected) @tm.network - def test_url(self): + def test_url(self, datapath): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @pytest.mark.slow - def test_file(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + def test_file(self, datapath): + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) try: @@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self): tm.assert_frame_equal(result, expected) - def test_utf16_example(self): - path = tm.get_data_path('utf16_ex.txt') + def test_utf16_example(self, datapath): + path = datapath('io', 'parser', 'data', 'utf16_ex.txt') # it works! 
and is the right length result = self.read_table(path, encoding='utf-16') @@ -767,8 +765,8 @@ def test_utf16_example(self): result = self.read_table(buf, encoding='utf-16') assert len(result) == 50 - def test_unicode_encoding(self): - pth = tm.get_data_path('unicode_series.csv') + def test_unicode_encoding(self, datapath): + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') result = self.read_csv(pth, header=None, encoding='latin-1') result = result.set_index(0) @@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self): result = self.read_csv(path) tm.assert_frame_equal(result, expected) - def test_sub_character(self): + def test_sub_character(self, datapath): # see gh-16893 - dirpath = tm.get_data_path() - filename = os.path.join(dirpath, "sub_char.csv") + filename = datapath('io', 'parser', 'data', 'sub_char.csv') expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) result = self.read_csv(filename) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index e84db66561c49..e4950af19ea95 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self): tm.assert_frame_equal(expected, df) - def test_read_csv_compressed_utf16_example(self): + def test_read_csv_compressed_utf16_example(self, datapath): # GH18071 - path = tm.get_data_path('utf16_ex_small.zip') + path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip') result = self.read_csv(path, encoding='utf-16', compression='zip', sep='\t') diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index b91ce04673e29..8060ebf2fbcd4 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -125,9 +125,9 @@ def test_categorical_dtype_high_cardinality_numeric(self): np.sort(actual.a.cat.categories), ordered=True) tm.assert_frame_equal(actual, expected) - def test_categorical_dtype_encoding(self): + def test_categorical_dtype_encoding(self, datapath): # GH 10153 - pth = tm.get_data_path('unicode_series.csv') + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') encoding = 'latin-1' expected = self.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) @@ -135,7 +135,7 @@ def test_categorical_dtype_encoding(self): dtype={1: 'category'}) tm.assert_frame_equal(actual, expected) - pth = tm.get_data_path('utf16_ex.txt') + pth = datapath('io', 'parser', 'data', 'utf16_ex.txt') encoding = 'utf-16' expected = self.read_table(pth, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index fdf45f307e953..e2243b8087a5b 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -48,10 +48,16 @@ def check_compressed_urls(salaries_table, compression, extension, mode, tm.assert_frame_equal(url_table, salaries_table) +@pytest.fixture +def tips_df(datapath): + """DataFrame with the tips dataset.""" + return read_csv(datapath('io', 'parser', 'data', 'tips.csv')) + + @pytest.mark.usefixtures("s3_resource") class TestS3(object): - def test_parse_public_s3_bucket(self): + def test_parse_public_s3_bucket(self, tips_df): pytest.importorskip('s3fs') # more of an integration test due to the not-public contents portion # can probably mock this though. 
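# Editorial annotation (not part of the patch): a rough sketch of the
# mocking idea in the comment above. The s3_resource fixture this class
# relies on can be built along these lines with moto and boto3, so no
# real network access is needed. Assumes moto's older mock_s3 API; the
# fixture name and bucket layout here are illustrative, not the actual
# pandas fixture.
import pytest

@pytest.fixture
def s3_bucket(tips_file):
    boto3 = pytest.importorskip('boto3')
    moto = pytest.importorskip('moto')
    with moto.mock_s3():
        conn = boto3.resource('s3', region_name='us-east-1')
        conn.create_bucket(Bucket='pandas-test')
        # Seed the fake bucket with the local tips.csv test file.
        conn.Bucket('pandas-test').upload_file(tips_file, 'tips.csv')
        yield conn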
@@ -60,45 +66,40 @@ def test_parse_public_s3_bucket(self): ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self): + def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self): + def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self): + def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self): + def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp) @@ -109,14 +110,13 @@ def test_parse_public_s3_bucket_chunked(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self): + def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp, @@ -127,36 +127,33 @@ def test_parse_public_s3_bucket_chunked_python(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self): + def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self): + def test_infer_s3_compression(self, tips_df): for ext in ['', '.gz', '.bz2']: df = 
read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self): + def test_parse_public_s3_bucket_nrows_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 7717102b64fc5..b6f13039641a2 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import pytest import pandas.util.testing as tm from pandas import read_csv, read_table, DataFrame @@ -45,8 +46,9 @@ def read_table(self, *args, **kwargs): def float_precision_choices(self): raise com.AbstractMethodError(self) - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index e8d9d8b52164b..c7026e3e0fc88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -28,8 +28,9 @@ class TestTextReader(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b80263021c269..101ee3e619f5b 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -11,8 +11,9 @@ class TestSAS7BDAT(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: @@ -123,9 +124,8 @@ def test_iterator_read_too_much(self): rdr.close() -def test_encoding_options(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test1.sas7bdat") +def test_encoding_options(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) df2 = pd.read_sas(fname, encoding='utf-8') for col in df1.columns: @@ -143,43 +143,39 @@ def test_encoding_options(): assert(x == y.decode()) -def test_productsales(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "productsales.sas7bdat") +def test_productsales(datapath): + fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding='utf-8') - fname = os.path.join(dirpath, "productsales.csv") + fname = datapath("io", "sas", "data", 
"productsales.csv") df0 = pd.read_csv(fname, parse_dates=['MONTH']) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) -def test_12659(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test_12659.sas7bdat") +def test_12659(datapath): + fname = datapath("io", "sas", "data", "test_12659.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "test_12659.csv") + fname = datapath("io", "sas", "data", "test_12659.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0) -def test_airline(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "airline.sas7bdat") +def test_airline(datapath): + fname = datapath("io", "sas", "data", "airline.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "airline.csv") + fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0, check_exact=False) -def test_date_time(): +def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "datetime.sas7bdat") + fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "datetime.csv") + fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) # GH 19732: Timestamps imported from sas will incur floating point errors @@ -187,9 +183,8 @@ def test_date_time(): tm.assert_frame_equal(df, df0) -def test_zero_variables(): +def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "zero_variables.sas7bdat") + fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index de31c3e36a8d5..6e5b2ab067aa5 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -1,3 +1,4 @@ +import pytest import pandas as pd import pandas.util.testing as tm from pandas.io.sas.sasreader import read_sas @@ -18,8 +19,9 @@ def numeric_as_float(data): class TestXport(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a89156db38ae3..5c9739be73393 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -149,27 +149,22 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): reader(path) @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_fwf, 'os', os.path.join(HERE, 'data', - 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', os.path.join(HERE, 'data', - 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - 
(pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', os.path.join(HERE, 'sas', 'data', - 'test1.sas7bdat')), - (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', - 'tsframe_v012.json')), - (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', - 'frame.mp')), - (pd.read_pickle, 'os', os.path.join(HERE, 'data', - 'categorical_0_14_1.pickle')), + (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), + (pd.read_table, 'os', ('io', 'data', 'iris.csv')), + (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), + (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), + (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), + (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', + 'datetimetz_object.h5')), + (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), + (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), + (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), + (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), + (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), ]) - def test_read_fspath_all(self, reader, module, path): + def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) + path = datapath(*path) mypath = CustomFSPath(path) result = reader(mypath) @@ -232,13 +227,14 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) -class TestMMapWrapper(object): +@pytest.fixture +def mmap_file(datapath): + return datapath('io', 'data', 'test_mmap.csv') + - def setup_method(self, method): - self.mmap_file = os.path.join(tm.get_data_path(), - 'test_mmap.csv') +class TestMMapWrapper(object): - def test_constructor_bad_file(self): + def test_constructor_bad_file(self, mmap_file): non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 @@ -252,15 +248,15 @@ def test_constructor_bad_file(self): tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) - target = open(self.mmap_file, 'r') + target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( ValueError, msg, common.MMapWrapper, target) - def test_get_attr(self): - with open(self.mmap_file, 'r') as target: + def test_get_attr(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) @@ -273,8 +269,8 @@ def test_get_attr(self): assert not hasattr(wrapper, 'foo') - def test_next(self): - with open(self.mmap_file, 'r') as target: + def test_next(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) lines = target.readlines() diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 05423474f330a..4e2b2af0ebfe7 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -39,8 +39,9 @@ @td.skip_if_no('xlrd', '0.9') class SharedItems(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -49,7 +50,6 @@ def setup_method(self, method): def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. 
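# Editorial annotation (not part of the patch): a minimal sketch of a
# version-gated skip marker in the spirit of the td.skip_if_no('xlrd',
# '0.9') decorator used on this class. The helper name is hypothetical
# and assumes the module exposes __version__.
import pytest
from distutils.version import LooseVersion

def skip_if_older_than(modname, min_version):
    try:
        mod = __import__(modname)
        ok = LooseVersion(mod.__version__) >= LooseVersion(min_version)
    except ImportError:
        ok = False
    return pytest.mark.skipif(
        not ok, reason="{} >= {} required".format(modname, min_version))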
- Test data path is defined by pandas.util.testing.get_data_path() Parameters ---------- @@ -68,8 +68,7 @@ def get_csv_refdf(self, basename): def get_excelfile(self, basename, ext): """ - Return test data ExcelFile instance. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data ExcelFile instance. Parameters ---------- @@ -86,8 +85,7 @@ def get_excelfile(self, basename, ext): def get_exceldf(self, basename, ext, *args, **kwds): """ - Return test data DataFrame. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data DataFrame. Parameters ---------- diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a56946b82b027..9c6a8de7ed446 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1,6 +1,5 @@ from __future__ import print_function -import glob import os import re import threading @@ -25,8 +24,18 @@ import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network +HERE = os.path.dirname(__file__) -DATA_PATH = tm.get_data_path() + +@pytest.fixture(params=[ + 'chinese_utf-16.html', + 'chinese_utf-32.html', + 'chinese_utf-8.html', + 'letz_latin1.html', +]) +def html_encoding_file(request, datapath): + """Parametrized fixture for HTML encoding test filenames.""" + return datapath('io', 'data', 'html_encoding', request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -44,11 +53,11 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): @td.skip_if_no('bs4') -def test_bs4_version_fails(monkeypatch): +def test_bs4_version_fails(monkeypatch, datapath): import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') with tm.assert_raises_regex(ValueError, "minimum version"): - read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor='bs4') def test_invalid_flavor(): @@ -59,8 +68,8 @@ def test_invalid_flavor(): @td.skip_if_no('bs4') @td.skip_if_no('lxml') -def test_same_ordering(): - filename = os.path.join(DATA_PATH, 'valid_markup.html') +def test_same_ordering(datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -72,11 +81,14 @@ def test_same_ordering(): pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): - spam_data = os.path.join(DATA_PATH, 'spam.html') - spam_data_kwargs = {} - if PY3: - spam_data_kwargs['encoding'] = 'UTF-8' - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + @pytest.fixture(autouse=True) + def set_files(self, datapath): + self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data_kwargs = {} + if PY3: + self.spam_data_kwargs['encoding'] = 'UTF-8' + self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -272,7 +284,8 @@ def test_invalid_url(self): @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), 'First', + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), + 'First', attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: @@ -326,7 +339,7 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = 
self.read_html(file_path_to_url(url), + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) assert isinstance(dfs, list) @@ -352,9 +365,9 @@ def test_python_docs_table(self): assert sorted(zz) == sorted(['Repo', 'What']) @pytest.mark.slow - def test_thousands_macau_stats(self): + def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] @@ -362,9 +375,9 @@ def test_thousands_macau_stats(self): assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow - def test_thousands_macau_index_col(self): + def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath('io', 'data', 'macau.html') dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -518,8 +531,8 @@ def test_countries_municipalities(self): res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) - def test_nyse_wsj_commas_table(self): - data = os.path.join(DATA_PATH, 'nyse_wsj.html') + def test_nyse_wsj_commas_table(self, datapath): + data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] @@ -530,7 +543,7 @@ def test_nyse_wsj_commas_table(self): tm.assert_index_equal(df.columns, columns) @pytest.mark.slow - def test_banklist_header(self): + def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -541,7 +554,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] - ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), + ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) assert df.shape == ground_truth.shape @@ -658,19 +671,19 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + def test_computer_sales_page(self, datapath): + data = datapath('io', 'data', 'computer_sales_page.html') with tm.assert_raises_regex(ParserError, r"Passed header=\[0,1\] are " r"too many rows for this " r"multi_index of columns"): self.read_html(data, header=[0, 1]) - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + data = datapath('io', 'data', 'computer_sales_page.html') assert self.read_html(data, header=[1, 2]) - def test_wikipedia_states_table(self): - data = os.path.join(DATA_PATH, 'wikipedia_states.html') + def test_wikipedia_states_table(self, datapath): + data = datapath('io', 'data', 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] @@ -784,15 +797,15 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self): - filename = os.path.join(DATA_PATH, 'valid_markup.html') + def test_works_on_valid_markup(self, datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs = self.read_html(filename, index_col=0) 
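# Editorial annotation (not part of the patch): why os.path.abspath was
# wrapped around file_path_to_url in this commit -- a file:// URL only
# makes sense with an absolute path, so a path relative to the repo root
# must be resolved first. A minimal standard-library sketch of the same
# idea (pandas' helper wraps pathname2url similarly):
import os
try:
    from urllib.parse import urljoin        # Python 3
    from urllib.request import pathname2url
except ImportError:                          # Python 2
    from urlparse import urljoin
    from urllib import pathname2url

def to_file_url(path):
    # Resolve to an absolute path before building the file:// URL.
    return urljoin('file:', pathname2url(os.path.abspath(path)))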
assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self): - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + def test_fallback_success(self, datapath): + banklist_data = datapath('io', 'data', 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) def test_to_html_timestamp(self): @@ -835,22 +848,23 @@ def test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table - @pytest.mark.parametrize("f", glob.glob( - os.path.join(DATA_PATH, 'html_encoding', '*.html'))) - def test_encode(self, f): - _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') + def test_encode(self, html_encoding_file): + _, encoding = os.path.splitext( + os.path.basename(html_encoding_file) + )[0].split('_') try: - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_string = self.read_html(fobj.read(), encoding=encoding, index_col=0).pop() - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_file_like = self.read_html(BytesIO(fobj.read()), encoding=encoding, index_col=0).pop() - from_filename = self.read_html(f, encoding=encoding, + from_filename = self.read_html(html_encoding_file, + encoding=encoding, index_col=0).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) @@ -906,7 +920,7 @@ def seekable(self): assert self.read_html(bad) @pytest.mark.slow - def test_importcheck_thread_safety(self): + def test_importcheck_thread_safety(self, datapath): # see gh-16928 class ErrorThread(threading.Thread): @@ -921,7 +935,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = os.path.join(DATA_PATH, 'valid_markup.html') + filename = datapath('io', 'data', 'valid_markup.html') helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index cfac77291803d..491d5fe33cc33 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import os import datetime +import glob import numpy as np from distutils.version import LooseVersion @@ -837,13 +838,13 @@ def test_default_encoding(self): assert_frame_equal(result, frame) -def legacy_packers_versions(): - # yield the packers versions - path = tm.get_data_path('legacy_msgpack') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - yield v +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_msgpack", "*", "*.msgpack")) + + +@pytest.fixture(params=files) +def legacy_packer(request, datapath): + return datapath(request.param) class TestMsgpack(object): @@ -920,24 +921,20 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('version', legacy_packers_versions()) def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - version): - - pth = tm.get_data_path('legacy_msgpack/{0}'.format(version)) - n = 0 - for f in os.listdir(pth): - # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): - continue - vf = os.path.join(pth, f) - try: - with catch_warnings(record=True): - 
self.compare(current_packers_data, all_packers_data, - vf, version) - except ImportError: - # blosc not installed - continue - n += 1 - assert n > 0, 'Msgpack files are not tested' + legacy_packer, datapath): + + version = os.path.basename(os.path.dirname(legacy_packer)) + + # GH12142 0.17 files packed in P2 can't be read in P3 + if (compat.PY3 and version.startswith('0.17.') and + legacy_packer.split('.')[-4][-1] == '2'): + msg = "Files packed in Py2 can't be read in Py3 ({})" + pytest.skip(msg.format(version)) + try: + with catch_warnings(record=True): + self.compare(current_packers_data, all_packers_data, + legacy_packer, version) + except ImportError: + # blosc not installed + pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index fbe2174e603e2..45cbbd43cd6a8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,7 +12,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ - +import glob import pytest from warnings import catch_warnings @@ -184,27 +184,25 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_pickle", "*", "*.pickle")) + + +@pytest.fixture(params=files) +def legacy_pickle(request, datapath): + return datapath(request.param) + + # --------------------- # tests # --------------------- -def legacy_pickle_versions(): - # yield the pickle versions - path = tm.get_data_path('legacy_pickle') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - for f in os.listdir(p): - yield (v, f) - - -@pytest.mark.parametrize('version, f', legacy_pickle_versions()) -def test_pickles(current_pickle_data, version, f): +def test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") - vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f)) + version = os.path.basename(os.path.dirname(legacy_pickle)) with catch_warnings(record=True): - compare(current_pickle_data, vf, version) + compare(current_pickle_data, legacy_pickle, version) def test_round_trip_current(current_pickle_data): @@ -260,12 +258,11 @@ def python_unpickler(path): compare_element(result, expected, typ) -def test_pickle_v0_14_1(): +def test_pickle_v0_14_1(datapath): cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -275,14 +272,13 @@ def test_pickle_v0_14_1(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) -def test_pickle_v0_15_2(): +def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5ac91c15047ff..9cbb62f72f0a0 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ 
-4452,28 +4452,27 @@ def f(): store.select('df') tm.assert_raises_regex(ClosedFileError, 'file is not open', f) - def test_pytables_native_read(self): - + def test_pytables_native_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native.h5'), + datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), mode='r') as store: d2 = store['detector/readout'] assert isinstance(d2, DataFrame) @pytest.mark.skipif(PY35 and is_platform_windows(), reason="native2 read fails oddly on windows / 3.5") - def test_pytables_native2_read(self): + def test_pytables_native2_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native2.h5'), + datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), mode='r') as store: str(store) d1 = store['detector'] assert isinstance(d1, DataFrame) - def test_legacy_table_read(self): + def test_legacy_table_read(self, datapath): # legacy table types with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_table.h5'), + datapath('io', 'data', 'legacy_hdf', 'legacy_table.h5'), mode='r') as store: with catch_warnings(record=True): @@ -5120,7 +5119,7 @@ def test_fspath(self): with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) - def test_read_py2_hdf_file_in_py3(self): + def test_read_py2_hdf_file_in_py3(self, datapath): # GH 16781 # tests reading a PeriodIndex DataFrame written in Python2 in Python3 @@ -5135,8 +5134,8 @@ def test_read_py2_hdf_file_in_py3(self): ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) with ensure_clean_store( - tm.get_data_path( - 'legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), + datapath('io', 'data', 'legacy_hdf', + 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), mode='r') as store: result = store['p'] assert_frame_equal(result, expected) @@ -5533,14 +5532,14 @@ def test_store_timezone(self): assert_frame_equal(result, df) - def test_legacy_datetimetz_object(self): + def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) with ensure_clean_store( - tm.get_data_path('legacy_hdf/datetimetz_object.h5'), + datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), mode='r') as store: result = store['df'] assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f3ab74d37a2bc..f8f742c5980ac 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -22,7 +22,6 @@ import pytest import sqlite3 import csv -import os import warnings import numpy as np @@ -184,9 +183,11 @@ class MixInBase(object): def teardown_method(self, method): - for tbl in self._get_all_tables(): - self.drop_table(tbl) - self._close_conn() + # if setup fails, there may not be a connection to close. 
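# Editorial annotation (not part of the patch): a generic sketch of the
# guarded-teardown pattern introduced below -- only release resources
# that setup actually managed to bind, so a setup failure is not masked
# by an AttributeError during teardown. Class and attribute names here
# are hypothetical.
import os

class _UsesResource(object):
    def setup_method(self, method):
        self.resource = open(os.devnull)   # may raise before assignment

    def teardown_method(self, method):
        if hasattr(self, 'resource'):      # setup may have failed early
            self.resource.close()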
+ if hasattr(self, 'conn'): + for tbl in self._get_all_tables(): + self.drop_table(tbl) + self._close_conn() class MySQLMixIn(MixInBase): @@ -253,9 +254,9 @@ def _get_exec(self): else: return self.conn.cursor() - def _load_iris_data(self): + def _load_iris_data(self, datapath): import io - iris_csv_file = os.path.join(tm.get_data_path(), 'iris.csv') + iris_csv_file = datapath('io', 'data', 'iris.csv') self.drop_table('iris') self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) @@ -503,9 +504,10 @@ class _TestSQLApi(PandasSQLTest): flavor = 'sqlite' mode = None - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_iris_view() self._load_test1_data() self._load_test2_data() @@ -1025,8 +1027,9 @@ class _EngineToConnMixin(object): A mixin that causes setup_connect to create a conn rather than an engine. """ - def setup_method(self, method): - super(_EngineToConnMixin, self).setup_method(method) + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + super(_EngineToConnMixin, self).setup_method(datapath) engine = self.conn conn = engine.connect() self.__tx = conn.begin() @@ -1034,12 +1037,14 @@ def setup_method(self, method): self.__engine = engine self.conn = conn - def teardown_method(self, method): + yield + self.__tx.rollback() self.conn.close() self.conn = self.__engine self.pandasSQL = sql.SQLDatabase(self.__engine) - super(_EngineToConnMixin, self).teardown_method(method) + # XXX: + # super(_EngineToConnMixin, self).teardown_method(method) @pytest.mark.single @@ -1136,7 +1141,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ flavor = None - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1149,10 +1154,11 @@ def setup_class(cls): msg = "{0} - can't connect to {1} server".format(cls, cls.flavor) pytest.skip(msg) - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.setup_connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_raw_sql() self._load_test1_data() @@ -1920,11 +1926,12 @@ class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): def connect(cls): return sqlite3.connect(':memory:') - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() self.pandasSQL = sql.SQLiteDatabase(self.conn) - self._load_iris_data() + self._load_iris_data(datapath) self._load_test1_data() @@ -2135,8 +2142,9 @@ def _skip_if_no_pymysql(): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - def setup_method(self, method): - self.method = method + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + self.method = request.function self.conn = sqlite3.connect(':memory:') def test_basic(self): @@ -2215,8 +2223,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): create_sql = """ CREATE TABLE test ( @@ -2236,7 +2243,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): pass @@ -2341,7 +2348,7 @@ def 
clean_up(test_table_to_drop): "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn): - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): _skip_if_no_pymysql() @@ -2370,7 +2377,8 @@ def setup_class(cls): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): _skip_if_no_pymysql() import pymysql try: @@ -2396,7 +2404,7 @@ def setup_method(self, method): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - self.method = method + self.method = request.function def test_basic(self): _skip_if_no_pymysql() @@ -2501,8 +2509,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): _skip_if_no_pymysql() drop_sql = "DROP TABLE IF EXISTS test" create_sql = """ @@ -2525,7 +2532,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): _skip_if_no_pymysql() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index f3a465da4e87f..cff63516f4086 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -25,8 +25,8 @@ @pytest.fixture -def dirpath(): - return tm.get_data_path() +def dirpath(datapath): + return datapath("io", "data") @pytest.fixture @@ -39,8 +39,9 @@ def parsed_114(dirpath): class TestStata(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f65791329f2f1..09687dd97bd43 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -74,11 +74,6 @@ def setup_method(self, method): else: self.default_figsize = (8.0, 6.0) self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' - # common test data - from pandas import read_csv - base = os.path.join(os.path.dirname(curpath()), os.pardir) - path = os.path.join(base, 'tests', 'data', 'iris.csv') - self.iris = read_csv(path) n = 100 with tm.RNGContext(42): diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index 2c2d371921d2f..a45b17ec98261 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -46,10 +46,9 @@ def test_boxplot_deprecated(self): by='indic') @pytest.mark.slow - def test_radviz_deprecated(self): - df = self.iris + def test_radviz_deprecated(self, iris): with tm.assert_produces_warning(FutureWarning): - plotting.radviz(frame=df, class_column='Name') + plotting.radviz(frame=iris, class_column='Name') @pytest.mark.slow def test_plot_params(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c82c939584dc7..0473610ea2f8f 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -100,11 +100,11 @@ def 
test_scatter_matrix_axis(self): axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow - def test_andrews_curves(self): + def test_andrews_curves(self, iris): from pandas.plotting import andrews_curves from matplotlib import cm - df = self.iris + df = iris _check_plot_works(andrews_curves, frame=df, class_column='Name') @@ -165,11 +165,11 @@ def test_andrews_curves(self): andrews_curves(data=df, class_column='Name') @pytest.mark.slow - def test_parallel_coordinates(self): + def test_parallel_coordinates(self, iris): from pandas.plotting import parallel_coordinates from matplotlib import cm - df = self.iris + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column='Name') @@ -234,11 +234,11 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] @pytest.mark.slow - def test_radviz(self): + def test_radviz(self, iris): from pandas.plotting import radviz from matplotlib import cm - df = self.iris + df = iris _check_plot_works(radviz, frame=df, class_column='Name') rgba = ('#556270', '#4ECDC4', '#C7F464') @@ -272,8 +272,8 @@ def test_radviz(self): self._check_colors(handles, facecolors=colors) @pytest.mark.slow - def test_subplot_titles(self): - df = self.iris.drop('Name', axis=1).head() + def test_subplot_titles(self, iris): + df = iris.drop('Name', axis=1).head() # Use the column names as the subplot titles title = list(df.columns) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index cebbcc41c3e17..59b53cd23010e 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,4 +1,3 @@ -import os import pytest import pytz @@ -13,8 +12,8 @@ class TestAsOfMerge(object): - def read_data(self, name, dedupe=False): - path = os.path.join(tm.get_data_path(), name) + def read_data(self, datapath, name, dedupe=False): + path = datapath('reshape', 'merge', 'data', name) x = read_csv(path) if dedupe: x = (x.drop_duplicates(['time', 'ticker'], keep='last') @@ -23,15 +22,17 @@ def read_data(self, name, dedupe=False): x.time = to_datetime(x.time) return x - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): - self.trades = self.read_data('trades.csv') - self.quotes = self.read_data('quotes.csv', dedupe=True) - self.asof = self.read_data('asof.csv') - self.tolerance = self.read_data('tolerance.csv') - self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.trades = self.read_data(datapath, 'trades.csv') + self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) + self.asof = self.read_data(datapath, 'asof.csv') + self.tolerance = self.read_data(datapath, 'tolerance.csv') + self.allow_exact_matches = self.read_data(datapath, + 'allow_exact_matches.csv') self.allow_exact_matches_and_tolerance = self.read_data( - 'allow_exact_matches_and_tolerance.csv') + datapath, 'allow_exact_matches_and_tolerance.csv') def test_examples1(self): """ doc-string examples """ @@ -423,11 +424,11 @@ def test_multiby_indexed(self): pd.merge_asof(left, right, left_index=True, right_index=True, left_by=['k1', 'k2'], right_by=['k1']) - def test_basic2(self): + def test_basic2(self, datapath): - expected = self.read_data('asof2.csv') - trades = self.read_data('trades2.csv') - quotes = self.read_data('quotes2.csv', dedupe=True) + expected = self.read_data(datapath, 'asof2.csv') + trades = self.read_data(datapath, 'trades2.csv') + quotes = self.read_data(datapath, 
'quotes2.csv', dedupe=True) result = merge_asof(trades, quotes, on='time', @@ -467,14 +468,14 @@ def test_valid_join_keys(self): merge_asof(trades, quotes, by='ticker') - def test_with_duplicates(self): + def test_with_duplicates(self, datapath): q = pd.concat([self.quotes, self.quotes]).sort_values( ['time', 'ticker']).reset_index(drop=True) result = merge_asof(self.trades, q, on='time', by='ticker') - expected = self.read_data('asof.csv') + expected = self.read_data(datapath, 'asof.csv') assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 5ea27f9e34e1c..807fb2530603a 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -282,10 +282,10 @@ def test_round_frac(self): result = tmod._round_frac(0.000123456, precision=2) assert result == 0.00012 - def test_qcut_binning_issues(self): + def test_qcut_binning_issues(self, datapath): # #1978, 1979 - path = os.path.join(tm.get_data_path(), 'cut_data.csv') - arr = np.loadtxt(path) + cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv')) + arr = np.loadtxt(cut_file) result = qcut(arr, 20) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 0c08d813a7f1b..00701ca2be946 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,4 +1,3 @@ -import os from distutils.version import LooseVersion from datetime import date, datetime, timedelta @@ -455,14 +454,15 @@ def test_add(self, offset_types, tz): assert isinstance(result, Timestamp) assert result == expected_localize - def test_pickle_v0_15_2(self): + def test_pickle_v0_15_2(self, datapath): offsets = {'DateOffset': DateOffset(years=1), 'MonthBegin': MonthBegin(1), 'Day': Day(1), 'YearBegin': YearBegin(1), 'Week': Week(1)} - pickle_path = os.path.join(tm.get_data_path(), - 'dateoffset_0_15_2.pickle') + + pickle_path = datapath('tseries', 'offsets', 'data', + 'dateoffset_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) # @@ -1854,12 +1854,10 @@ def _check_roundtrip(obj): _check_roundtrip(self.offset2) _check_roundtrip(self.offset * 2) - def test_pickle_compat_0_14_1(self): + def test_pickle_compat_0_14_1(self, datapath): hdays = [datetime(2013, 1, 1) for ele in range(4)] - - pth = tm.get_data_path() - - cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle')) + pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle') + cday0_14_1 = read_pickle(pth) cday = CDay(holidays=hdays) assert cday == cday0_14_1 diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index ab7c4fb528452..4d34987e14f75 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import pandas as pd import pytest import numpy as np @@ -841,3 +842,15 @@ def test_locale(self): # GH9744 locales = tm.get_locales() assert len(locales) >= 1 + + +def test_datapath_missing(datapath, request): + if not request.config.getoption("--strict-data-files"): + pytest.skip("Need to set '--strict-data-files'") + + with pytest.raises(ValueError): + datapath('not_a_file') + + result = datapath('data', 'iris.csv') + expected = os.path.join('pandas', 'tests', 'data', 'iris.csv') + assert result == expected diff --git a/pandas/util/_test_decorators.py 
b/pandas/util/_test_decorators.py index 27c24e3a68079..c6ab24403d58d 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,7 +23,6 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ - import pytest import locale from distutils.version import LooseVersion diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6384eca9849f6..b7edbff00a4b9 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -6,7 +6,6 @@ import sys import tempfile import warnings -import inspect import os import subprocess import locale @@ -751,15 +750,6 @@ def ensure_clean(filename=None, return_filelike=False): print("Exception on removing file: {error}".format(error=e)) -def get_data_path(f=''): - """Return the path of a data file, these are relative to the current test - directory. - """ - # get our callers file - _, filename, _, _, _, _ = inspect.getouterframes(inspect.currentframe())[1] - base_dir = os.path.abspath(os.path.dirname(filename)) - return os.path.join(base_dir, 'data', f) - # ----------------------------------------------------------------------------- # Comparators diff --git a/setup.cfg b/setup.cfg index 6d9657737a8bd..9ec967c25e225 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,4 +32,5 @@ markers = slow: mark a test as slow network: mark a test as network high_memory: mark a test as a high-memory only -doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL +addopts = --strict-data-files +doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL \ No newline at end of file diff --git a/setup.py b/setup.py index c5831eb097767..5d6bbbcf7b862 100755 --- a/setup.py +++ b/setup.py @@ -734,11 +734,7 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['data/*', 'templates/*', '_libs/*.dll'], - 'pandas.tests.io': ['data/legacy_hdf/*.h5', - 'data/legacy_pickle/*/*.pickle', - 'data/legacy_msgpack/*/*.msgpack', - 'data/html_encoding/*.html']}, + package_data={'': ['templates/*', '_libs/*.dll']}, ext_modules=extensions, maintainer_email=EMAIL, description=DESCRIPTION, From db51f0a57030fd71d26df00c2e3dd63b7fd542b9 Mon Sep 17 00:00:00 2001 From: david-liu-brattle-1 <36486871+david-liu-brattle-1@users.noreply.github.com> Date: Tue, 26 Jun 2018 18:19:41 -0400 Subject: [PATCH 088/116] Cleanup clipboard tests (#21163) (cherry picked from commit 9d38e0ef5842fafcc4e391abc6aba486684e6dc7) --- pandas/tests/io/test_clipboard.py | 196 ++++++++++++++++++++---------- 1 file changed, 129 insertions(+), 67 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 98c0effabec84..80fddd50fc9a8 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -9,10 +9,11 @@ from pandas import DataFrame from pandas import read_clipboard from pandas import get_option +from pandas.compat import PY2 from pandas.util import testing as tm from pandas.util.testing import makeCustomDataframe as mkdf from pandas.io.clipboard.exceptions import PyperclipException -from pandas.io.clipboard import clipboard_set +from pandas.io.clipboard import clipboard_set, clipboard_get try: @@ -22,73 +23,134 @@ _DEPS_INSTALLED = 0 +def build_kwargs(sep, excel): + kwargs = {} + if excel != 'default': + kwargs['excel'] = excel + if sep != 'default': + kwargs['sep'] = sep + return kwargs + + +@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii', + 'colwidth', 
'mixed', 'float', 'int']) +def df(request): + data_type = request.param + + if data_type == 'delims': + return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], + 'b': ['hi\'j', 'k\'\'lm']}) + elif data_type == 'utf8': + return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], + 'b': ['øπ∆˚¬', 'œ∑´®']}) + elif data_type == 'string': + return mkdf(5, 3, c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'long': + max_rows = get_option('display.max_rows') + return mkdf(max_rows + 1, 3, + data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'nonascii': + return pd.DataFrame({'en': 'in English'.split(), + 'es': 'en español'.split()}) + elif data_type == 'colwidth': + _cw = get_option('display.max_colwidth') + 1 + return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'mixed': + return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, + 'b': np.arange(1, 6), + 'c': list('abcde')}) + elif data_type == 'float': + return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'int': + return mkdf(5, 3, data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + else: + raise ValueError + + @pytest.mark.single @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") class TestClipboard(object): - - @classmethod - def setup_class(cls): - cls.data = {} - cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['float'] = mkdf(5, 3, - data_gen_f=lambda r, c: float(r) + 0.01, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, - 'b': np.arange(1, 6), - 'c': list('abcde')}) - - # Test columns exceeding "max_colwidth" (GH8305) - _cw = get_option('display.max_colwidth') + 1 - cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - # Test GH-5346 - max_rows = get_option('display.max_rows') - cls.data['longdf'] = mkdf(max_rows + 1, 3, - data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - # Test for non-ascii text: GH9263 - cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), - 'es': 'en español'.split()}) - # unicode round trip test for GH 13747, GH 12529 - cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], - 'b': ['øπ∆˚¬', 'œ∑´®']}) - cls.data_types = list(cls.data.keys()) - - @classmethod - def teardown_class(cls): - del cls.data_types, cls.data - - def check_round_trip_frame(self, data_type, excel=None, sep=None, + def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): - data = self.data[data_type] data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - if sep is not None: - result = read_clipboard(sep=sep, index_col=0, encoding=encoding) - else: - result = read_clipboard(encoding=encoding) + result = read_clipboard(sep=sep or '\t', index_col=0, + encoding=encoding) tm.assert_frame_equal(data, result, check_dtype=False) - def test_round_trip_frame_sep(self): 
- for dt in self.data_types: - self.check_round_trip_frame(dt, sep=',') - self.check_round_trip_frame(dt, sep=r'\s+') - self.check_round_trip_frame(dt, sep='|') - - def test_round_trip_frame_string(self): - for dt in self.data_types: - self.check_round_trip_frame(dt, excel=False) - - def test_round_trip_frame(self): - for dt in self.data_types: - self.check_round_trip_frame(dt) + # Test that default arguments copy as tab delimited + @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' + 'Issue in #21104, Fixed in #21111') + def test_round_trip_frame(self, df): + self.check_round_trip_frame(df) + + # Test that explicit delimiters are respected + @pytest.mark.parametrize('sep', ['\t', ',', '|']) + def test_round_trip_frame_sep(self, df, sep): + self.check_round_trip_frame(df, sep=sep) + + # Test white space separator + @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " + "aren't handled correctly in default c engine. Fixed " + "in #21111 by defaulting to python engine for " + "whitespace separator") + def test_round_trip_frame_string(self, df): + df.to_clipboard(excel=False, sep=None) + result = read_clipboard() + assert df.to_string() == result.to_string() + assert df.shape == result.shape + + # Two character separator is not supported in to_clipboard + # Test that multi-character separators are not silently passed + @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") + def test_excel_sep_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=True, sep=r'\t') + + # Separator is ignored when excel=False and should produce a warning + @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") + def test_copy_delim_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=False, sep='\t') + + # Tests that the default behavior of to_clipboard is tab + # delimited and excel="True" + @pytest.mark.xfail(reason="to_clipboard defaults to space delim. Issue in " + "#21104, Fixed in #21111") + @pytest.mark.parametrize('sep', ['\t', None, 'default']) + @pytest.mark.parametrize('excel', [True, None, 'default']) + def test_clipboard_copy_tabs_default(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + if PY2: + # to_clipboard copies unicode, to_csv produces bytes. This is + # expected behavior + assert clipboard_get().encode('utf-8') == df.to_csv(sep='\t') + else: + assert clipboard_get() == df.to_csv(sep='\t') + + # Tests reading of white space separated tables + @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " + "aren't handled correctly. in default c engine. 
Fixed " + "in #21111 by defaulting to python engine for " + "whitespace separator") + @pytest.mark.parametrize('sep', [None, 'default']) + @pytest.mark.parametrize('excel', [False]) + def test_clipboard_copy_strings(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + result = read_clipboard(sep=r'\s+') + assert result.to_string() == df.to_string() + assert df.shape == result.shape def test_read_clipboard_infer_excel(self): # gh-19010: avoid warnings @@ -124,15 +186,15 @@ def test_read_clipboard_infer_excel(self): tm.assert_frame_equal(res, exp) - def test_invalid_encoding(self): + def test_invalid_encoding(self, df): # test case for testing invalid encoding - data = self.data['string'] with pytest.raises(ValueError): - data.to_clipboard(encoding='ascii') + df.to_clipboard(encoding='ascii') with pytest.raises(NotImplementedError): pd.read_clipboard(encoding='ascii') - def test_round_trip_valid_encodings(self): - for enc in ['UTF-8', 'utf-8', 'utf8']: - for dt in self.data_types: - self.check_round_trip_frame(dt, encoding=enc) + @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' + 'Issue in #21104, Fixed in #21111') + @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) + def test_round_trip_valid_encodings(self, enc, df): + self.check_round_trip_frame(df, encoding=enc) From d9ada974d0f73c72953fcece56e084dc277bc4c7 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 27 Jun 2018 03:57:55 -0600 Subject: [PATCH 089/116] DOC: Fix versionadded directive typos in IntervalIndex (#21649) (cherry picked from commit b35cb1c127aae894c2a1ee5ab2f16987b91e9000) --- pandas/core/indexes/interval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index eb9d7efc06c27..23a655b9a51ee 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -160,7 +160,7 @@ class IntervalIndex(IntervalMixin, Index): dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Attributes ---------- @@ -438,7 +438,7 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -568,7 +568,7 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -619,7 +619,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -671,7 +671,7 @@ def to_tuples(self, na_tuple=True): Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA value itself if False, ``nan``. - ..versionadded:: 0.23.0 + .. 
versionadded:: 0.23.0 Examples -------- From 0a42f18687a1e586b09bfaa18b0ddc85e20d760a Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 29 Jun 2018 01:26:38 +0100 Subject: [PATCH 090/116] Fix Timestamp rounding (#21507) (cherry picked from commit 76ef7c459e752f72abc62e030fd1cea0117c1dca) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/_libs/tslibs/timestamps.pyx | 34 +++++++++++++------ .../indexes/datetimes/test_scalar_compat.py | 19 +++++++++++ .../tests/scalar/timestamp/test_unary_ops.py | 20 ++++++++++- 4 files changed, 62 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index b3da4d1c4e288..9d96e807dfd3e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -54,7 +54,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) -- +- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) .. _whatsnew_0232.performance: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ba5ebdab82ddc..123ccebf83a56 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -59,42 +59,51 @@ cdef inline object create_timestamp_from_ts(int64_t value, def round_ns(values, rounder, freq): + """ Applies rounding function at given frequency Parameters ---------- - values : int, :obj:`ndarray` - rounder : function + values : :obj:`ndarray` + rounder : function, eg. 'ceil', 'floor', 'round' freq : str, obj Returns ------- - int or :obj:`ndarray` + :obj:`ndarray` """ + from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos + + # GH21262 If the Timestamp is multiple of the freq str + # don't apply any rounding + mask = values % unit == 0 + if mask.all(): + return values + r = values.copy() + if unit < 1000: # for nano rounding, work with the last 6 digits separately # due to float precision buff = 1000000 - r = (buff * (values // buff) + unit * - (rounder((values % buff) * (1 / float(unit)))).astype('i8')) + r[~mask] = (buff * (values[~mask] // buff) + + unit * (rounder((values[~mask] % buff) * + (1 / float(unit)))).astype('i8')) else: if unit % 1000 != 0: msg = 'Precision will be lost using frequency: {}' warnings.warn(msg.format(freq)) - # GH19206 # to deal with round-off when unit is large if unit >= 1e9: divisor = 10 ** int(np.log10(unit / 1e7)) else: divisor = 10 - - r = (unit * rounder((values * (divisor / float(unit))) / divisor) - .astype('i8')) - + r[~mask] = (unit * rounder((values[~mask] * + (divisor / float(unit))) / divisor) + .astype('i8')) return r @@ -649,7 +658,10 @@ class Timestamp(_Timestamp): else: value = self.value - r = round_ns(value, rounder, freq) + value = np.array([value], dtype=np.int64) + + # Will only ever contain 1 element for timestamp + r = round_ns(value, rounder, freq)[0] result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 9180bb0af3af3..801dcb91b124e 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -134,6 +134,21 @@ def test_round(self, tz): ts = '2016-10-17 
12:00:00.001501031' DatetimeIndex([ts]).round('1010ns') + def test_no_rounding_occurs(self, tz): + # GH 21262 + rng = date_range(start='2016-01-01', periods=5, + freq='2Min', tz=tz) + + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'), + ]) + + tm.assert_index_equal(rng.round(freq='2T'), expected_rng) + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), @@ -143,6 +158,10 @@ def test_round(self, tz): ['1823-01-01 00:00:01.000000020']), (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), + (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']), + (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']), + (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']), + (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']), (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', ('NaT', '1823-01-01 00:00:01')), (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index aecddab8477fc..dbe31ccb11114 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -118,6 +118,25 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = Timestamp(expected) assert result == expected + @pytest.mark.parametrize('test_input, freq, expected', [ + ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'), + ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'), + ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'), + ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'), + ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'), + ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), + ]) + @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round']) + def test_round_minute_freq(self, test_input, freq, expected, rounder): + # Ensure timestamps that shouldnt round dont! 
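(For reference, the behavior this test pins down, as a minimal runnable sketch
against the fixed 0.23.2 semantics — here ``'15T'`` is the pandas offset alias
for 15 minutes:

    >>> import pandas as pd
    >>> ts = pd.Timestamp('2018-01-01 00:15:00')
    >>> ts.floor('15T')   # already a multiple of 15 minutes -> unchanged
    Timestamp('2018-01-01 00:15:00')
    >>> ts.ceil('15T')    # likewise unchanged
    Timestamp('2018-01-01 00:15:00')

Before the short-circuit mask added to ``round_ns`` earlier in this patch,
floating-point rounding could move such timestamps off their own value.)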
+ # GH#21262 + + dt = Timestamp(test_input) + expected = Timestamp(expected) + func = getattr(dt, rounder) + result = func(freq) + assert result == expected + def test_ceil(self): dt = Timestamp('20130101 09:10:11') result = dt.ceil('D') @@ -257,7 +276,6 @@ def test_timestamp(self): if PY3: # datetime.timestamp() converts in the local timezone with tm.set_timezone('UTC'): - # should agree with datetime.timestamp method dt = ts.to_pydatetime() assert dt.timestamp() == ts.timestamp() From 2c00914e9addaa57d6b9f3308f25b5755e4dcc1a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 Jun 2018 02:38:39 +0200 Subject: [PATCH 091/116] API/REGR: (re-)allow neg/pos unary operation on object dtype (#21590) (cherry picked from commit 8cb6be0eced3bd3742efd0c03b2d903e3513cb11) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/generic.py | 7 +++++-- pandas/tests/frame/test_operators.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9d96e807dfd3e..07ce99d4f19aa 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -54,6 +54,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) .. _whatsnew_0232.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 50a5c10a6865f..02462218e8b02 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_dict_like, is_re_compilable, is_period_arraylike, + is_object_dtype, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.inference import is_hashable @@ -1117,7 +1118,8 @@ def __neg__(self): values = com._values_from_object(self) if is_bool_dtype(values): arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): arr = operator.neg(values) else: raise TypeError("Unary negative expects numeric dtype, not {}" @@ -1128,7 +1130,8 @@ def __pos__(self): values = com._values_from_object(self) if (is_bool_dtype(values) or is_period_arraylike(values)): arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): arr = operator.pos(values) else: raise TypeError("Unary plus expects numeric dtype, not {}" diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 5df50f3d7835b..fdf50805ad818 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -3,6 +3,7 @@ from __future__ import print_function from collections import deque from datetime import datetime +from decimal import Decimal import operator import pytest @@ -282,6 +283,17 @@ def test_neg_numeric(self, df, expected): assert_frame_equal(-df, expected) assert_series_equal(-df['a'], expected['a']) + @pytest.mark.parametrize('df, expected', [ + (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), + 
([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]), + ]) + def test_neg_object(self, df, expected): + # GH 21380 + df = pd.DataFrame({'a': df}) + expected = pd.DataFrame({'a': expected}) + assert_frame_equal(-df, expected) + assert_series_equal(-df['a'], expected['a']) + @pytest.mark.parametrize('df', [ pd.DataFrame({'a': ['a', 'b']}), pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), @@ -307,6 +319,15 @@ def test_pos_numeric(self, df): @pytest.mark.parametrize('df', [ pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': np.array([-1, 2], dtype=object)}), + pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}), + ]) + def test_pos_object(self, df): + # GH 21380 + assert_frame_equal(+df, df) + assert_series_equal(+df['a'], df['a']) + + @pytest.mark.parametrize('df', [ pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), ]) def test_pos_raises(self, df): From dddc81b7fb6d938ad96f40b6953e6db729c96da4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 Jun 2018 02:39:45 +0200 Subject: [PATCH 092/116] API: re-allow duplicate index level names (#21423) (cherry picked from commit 66b517c2f51ed20d4c6823272d5c2a0f47f96d84) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/indexes/multi.py | 19 +++++------- pandas/core/reshape/reshape.py | 12 ++++++++ pandas/tests/frame/test_alter_axes.py | 37 +++++++++++++++++++----- pandas/tests/frame/test_reshape.py | 10 +++++++ pandas/tests/groupby/test_categorical.py | 8 ++--- pandas/tests/indexes/test_multi.py | 25 +++++++++------- pandas/tests/io/test_pytables.py | 6 ++++ pandas/tests/reshape/test_pivot.py | 10 +++++-- 9 files changed, 90 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 07ce99d4f19aa..ab9c3bc3857d6 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -53,6 +53,7 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) +- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). 
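(Concretely, a sketch of the restored behavior described in that bullet; the
``KeyError`` text follows the accompanying test:

    >>> import pandas as pd
    >>> mi = pd.MultiIndex.from_product([[0, 1]] * 2, names=['a', 'a'])
    >>> mi.names                   # duplicate level names are allowed again
    FrozenList(['a', 'a'])
    >>> mi.get_level_values('a')   # but a duplicated name stays ambiguous
    Traceback (most recent call last):
        ...
    KeyError: 'Level a not found'

Positional access, e.g. ``mi.get_level_values(0)``, still works, since the
ambiguity check only applies to lookups by name.)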
- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) - Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 80bf73cfe7dd3..33db32cfe1166 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -672,30 +672,18 @@ def _set_names(self, names, level=None, validate=True): if level is None: level = range(self.nlevels) - used = {} else: level = [self._get_level_number(l) for l in level] - used = {self.levels[l].name: l - for l in set(range(self.nlevels)) - set(level)} # set the name for l, name in zip(level, names): if name is not None: - # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): raise TypeError('{}.name must be a hashable type' .format(self.__class__.__name__)) - - if name in used: - raise ValueError( - 'Duplicated level name: "{}", assigned to ' - 'level {}, is already used for level ' - '{}.'.format(name, l, used[name])) - self.levels[l].rename(name, inplace=True) - used[name] = l names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") @@ -2935,6 +2923,13 @@ def isin(self, values, level=None): else: return np.lib.arraysetops.in1d(labs, sought_labels) + def _reference_duplicate_name(self, name): + """ + Returns True if the name refered to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return sum(name == n for n in self.names) > 1 + MultiIndex._add_numeric_methods_disabled() MultiIndex._add_numeric_methods_add_sub_disabled() diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2757e0797a410..3d9e84954a63b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.index = index.remove_unused_levels() + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The index " + "names are not unique.".format(level=level)) + raise ValueError(msg) + self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 @@ -528,6 +534,12 @@ def factorize(index): N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The column " + "names are not unique.".format(level=level)) + raise ValueError(msg) + # Will also convert negative level numbers and check if out of bounds. 
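(The guard added just above surfaces to users as a ``ValueError`` when stacking
by an ambiguous column-level name — a sketch, with the message text taken from
this patch:

    >>> import pandas as pd
    >>> idx = pd.MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
    ...                                 names=['c1', 'c1'])
    >>> df = pd.DataFrame([1, 2], index=idx)
    >>> df.T.stack('c1')
    Traceback (most recent call last):
        ...
    ValueError: Ambiguous reference to c1. The column names are not unique.

The analogous check in ``_Unstacker`` above makes ``df.unstack('c1')`` raise
with the matching "index names are not unique" message.)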
level_num = frame.columns._get_level_number(level) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 164d6746edec0..21961906c39bb 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -130,19 +130,27 @@ def test_set_index2(self): result = df.set_index(df.C) assert result.index.name == 'C' - @pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')]) + @pytest.mark.parametrize( + 'level', ['a', pd.Series(range(0, 8, 2), name='a')]) def test_set_index_duplicate_names(self, level): - # GH18872 + # GH18872 - GH19029 df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b']) # Pass an existing level name: df.index.name = 'a' - pytest.raises(ValueError, df.set_index, level, append=True) - pytest.raises(ValueError, df.set_index, [level], append=True) - - # Pass twice the same level name: - df.index.name = 'c' - pytest.raises(ValueError, df.set_index, [level, level]) + expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)], + names=['a', 'a']) + result = df.set_index(level, append=True) + tm.assert_index_equal(result.index, expected) + result = df.set_index([level], append=True) + tm.assert_index_equal(result.index, expected) + + # Pass twice the same level name (only works with passing actual data) + if isinstance(level, pd.Series): + result = df.set_index([level, level]) + expected = pd.MultiIndex.from_tuples( + [(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a']) + tm.assert_index_equal(result.index, expected) def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], @@ -617,6 +625,19 @@ def test_reorder_levels(self): index=e_idx) assert_frame_equal(result, expected) + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels(['L0', 'L0', 'L0']) + assert_frame_equal(result, expected) + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d05321abefca6..ebf6c5e37b916 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,6 +560,16 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], + names=['c1', 'c1']) + df = DataFrame([1, 2], index=idx) + with pytest.raises(ValueError): + df.unstack('c1') + + with pytest.raises(ValueError): + df.T.stack('c1') + def test_unstack_unused_levels(self): # GH 17845: unused labels in index make unstack() cast int to float idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0fec6a8f96a24..cb76195eacf40 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -555,15 +555,11 @@ def test_as_index(): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) - # another not in-axis grouper - s = Series(['a', 'b', 'b'], name='cat2') + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], 
name='cat') result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH18872: conflicting names in desired index - with pytest.raises(ValueError): - df.groupby(['cat', s.rename('cat')], observed=True).sum() - # is original index dropped? group_columns = ['cat', 'A'] expected = DataFrame( diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 3ede83b5969ce..40e64d99ac440 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -655,22 +655,27 @@ def test_constructor_nonhashable_names(self): # With .set_names() tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed) - @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'], - ['1', 'a', '1']]) + @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], + [1, 'a', 1]]) def test_duplicate_level_names(self, names): - # GH18872 - pytest.raises(ValueError, pd.MultiIndex.from_product, - [[0, 1]] * 3, names=names) + # GH18872, GH19029 + mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names) + assert mi.names == names # With .rename() mi = pd.MultiIndex.from_product([[0, 1]] * 3) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names) + mi = mi.rename(names) + assert mi.names == names # With .rename(., level=) - mi.rename(names[0], level=1, inplace=True) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names[:2], level=[0, 2]) + mi.rename(names[1], level=1, inplace=True) + mi = mi.rename([names[0], names[2]], level=[0, 2]) + assert mi.names == names + + def test_duplicate_level_names_access_raises(self): + self.index.names = ['foo', 'foo'] + tm.assert_raises_regex(KeyError, 'Level foo not found', + self.index._get_level_number, 'foo') def assert_multiindex_copied(self, copy, original): # Levels should be (at least, shallow copied) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 9cbb62f72f0a0..7dafc9603f96d 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1842,6 +1842,12 @@ def make_index(names=None): 'a', 'b'], index=make_index(['date', 'a', 't'])) pytest.raises(ValueError, store.append, 'df', df) + # dup within level + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], + index=make_index(['date', 'date', 'date'])) + pytest.raises(ValueError, store.append, 'df', df) + # fully names _maybe_remove(store, 'df') df = DataFrame(np.zeros((12, 2)), columns=[ diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3ec60d50f2792..b71954163f9e1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1729,9 +1729,15 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279, GH 18872 + # GH 13279 s = pd.Series(range(3), name='foo') - pytest.raises(ValueError, pd.crosstab, s, s) + + result = pd.crosstab(s, s) + expected_index = pd.Index(range(3), name='foo') + expected = pd.DataFrame(np.eye(3, dtype=np.int64), + index=expected_index, + columns=expected_index) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) From 06d76e0c6dc0008510e7381cb774f183b8e8271b Mon Sep 17 00:00:00 2001 From: david-liu-brattle-1 <36486871+david-liu-brattle-1@users.noreply.github.com> Date: Fri, 29 Jun 2018 08:22:15 -0400 Subject: [PATCH 093/116] BUG: 
to_clipboard fails to format output for Excel (#21111) (cherry picked from commit dc45fbafef172e357cb5decdeab22de67160f5b7) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/io/clipboards.py | 32 +++++++++++++++++++++++++------ pandas/tests/io/test_clipboard.py | 16 ---------------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ab9c3bc3857d6..608db7487c1e4 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -57,6 +57,7 @@ Fixed Regressions - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) - Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) +- Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) .. _whatsnew_0232.performance: diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index dcc221ce978b3..b3f40b3a2429c 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -1,6 +1,7 @@ """ io on the clipboard """ from pandas import compat, get_option, option_context, DataFrame -from pandas.compat import StringIO, PY2 +from pandas.compat import StringIO, PY2, PY3 +import warnings def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover @@ -32,7 +33,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # try to decode (if needed on PY3) # Strange. linux py33 doesn't complain, win py33 does - if compat.PY3: + if PY3: try: text = compat.bytes_to_str( text, encoding=(kwargs.get('encoding') or @@ -55,11 +56,27 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover counts = {x.lstrip().count('\t') for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: - sep = r'\t' + sep = '\t' + # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get('delim_whitespace') is None: sep = r'\s+' + # Regex separator currently only works with python engine. + # Default to python if separator is multi-character (regex) + if len(sep) > 1 and kwargs.get('engine') is None: + kwargs['engine'] = 'python' + elif len(sep) > 1 and kwargs.get('engine') == 'c': + warnings.warn('read_clipboard with regex separator does not work' + ' properly with c engine') + + # In PY2, the c table reader first encodes text with UTF-8 but Python + # table reader uses the format of the passed string. 
For consistency, + # encode strings for python engine so that output from python and c + # engines produce consistent results + if kwargs.get('engine') == 'python' and PY2: + text = text.encode('utf-8') + return read_table(StringIO(text), sep=sep, **kwargs) @@ -99,7 +116,7 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover if excel: try: if sep is None: - sep = r'\t' + sep = '\t' buf = StringIO() # clipboard_set (pyperclip) expects unicode obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) @@ -108,8 +125,11 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover text = text.decode('utf-8') clipboard_set(text) return - except: - pass + except TypeError: + warnings.warn('to_clipboard in excel mode requires a single ' + 'character separator.') + elif sep is not None: + warnings.warn('to_clipboard with excel=False ignores the sep argument') if isinstance(obj, DataFrame): # str(df) has various unhelpful defaults, like truncation diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 80fddd50fc9a8..a6b331685e72a 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -88,8 +88,6 @@ def check_round_trip_frame(self, data, excel=None, sep=None, tm.assert_frame_equal(data, result, check_dtype=False) # Test that default arguments copy as tab delimited - @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' - 'Issue in #21104, Fixed in #21111') def test_round_trip_frame(self, df): self.check_round_trip_frame(df) @@ -99,10 +97,6 @@ def test_round_trip_frame_sep(self, df, sep): self.check_round_trip_frame(df, sep=sep) # Test white space separator - @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " - "aren't handled correctly in default c engine. Fixed " - "in #21111 by defaulting to python engine for " - "whitespace separator") def test_round_trip_frame_string(self, df): df.to_clipboard(excel=False, sep=None) result = read_clipboard() @@ -111,21 +105,17 @@ def test_round_trip_frame_string(self, df): # Two character separator is not supported in to_clipboard # Test that multi-character separators are not silently passed - @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") def test_excel_sep_warning(self, df): with tm.assert_produces_warning(): df.to_clipboard(excel=True, sep=r'\t') # Separator is ignored when excel=False and should produce a warning - @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") def test_copy_delim_warning(self, df): with tm.assert_produces_warning(): df.to_clipboard(excel=False, sep='\t') # Tests that the default behavior of to_clipboard is tab # delimited and excel="True" - @pytest.mark.xfail(reason="to_clipboard defaults to space delim. Issue in " - "#21104, Fixed in #21111") @pytest.mark.parametrize('sep', ['\t', None, 'default']) @pytest.mark.parametrize('excel', [True, None, 'default']) def test_clipboard_copy_tabs_default(self, sep, excel, df): @@ -139,10 +129,6 @@ def test_clipboard_copy_tabs_default(self, sep, excel, df): assert clipboard_get() == df.to_csv(sep='\t') # Tests reading of white space separated tables - @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " - "aren't handled correctly. in default c engine. 
Fixed " - "in #21111 by defaulting to python engine for " - "whitespace separator") @pytest.mark.parametrize('sep', [None, 'default']) @pytest.mark.parametrize('excel', [False]) def test_clipboard_copy_strings(self, sep, excel, df): @@ -193,8 +179,6 @@ def test_invalid_encoding(self, df): with pytest.raises(NotImplementedError): pd.read_clipboard(encoding='ascii') - @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' - 'Issue in #21104, Fixed in #21111') @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) def test_round_trip_valid_encodings(self, enc, df): self.check_round_trip_frame(df, encoding=enc) From 2fccdedda2c4bb0e5b9edce8269cdecc973b191d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Jul 2018 17:26:43 +0200 Subject: [PATCH 094/116] BUG: fix reindexing MultiIndex with categorical datetime-like level (#21657) (cherry picked from commit 1cc547185b92073a3465ea105055d7791e9e6c48) --- doc/source/whatsnew/v0.23.2.txt | 2 ++ pandas/core/indexes/multi.py | 26 +++++++++---------- .../tests/frame/test_axis_select_reindex.py | 15 ++++++++++- pandas/tests/groupby/test_categorical.py | 20 ++++++++++++++ pandas/tests/indexes/test_multi.py | 12 +++++++-- 5 files changed, 58 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 608db7487c1e4..bef90506477ed 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -55,6 +55,8 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby` + with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`). - Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) - Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 33db32cfe1166..9a4aa15f4cc25 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -11,6 +11,8 @@ from pandas.compat.numpy import function as nv from pandas import compat +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.common import ( _ensure_int64, _ensure_platform_int, @@ -808,20 +810,16 @@ def values(self): return self._tuples values = [] - for lev, lab in zip(self.levels, self.labels): - # Need to box timestamps, etc. - box = hasattr(lev, '_box_values') - # Try to minimize boxing. 
- if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._ndarray_values, - lab)) - elif box: - taken = algos.take_1d(lev._box_values(lev._ndarray_values), - lab, - fill_value=lev._na_value) - else: - taken = algos.take_1d(np.asarray(lev._values), lab) - values.append(taken) + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals): + vals = vals.get_values() + if (isinstance(vals.dtype, (PandasExtensionDtype, ExtensionDtype)) + or hasattr(vals, '_box_values')): + vals = vals.astype(object) + vals = np.array(vals, copy=False) + values.append(vals) self._tuples = lib.fast_zip(values) return self._tuples diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 0e0d6598f5101..004fb4eb0c128 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -10,7 +10,7 @@ import numpy as np from pandas.compat import lrange, lzip, u -from pandas import (compat, DataFrame, Series, Index, MultiIndex, +from pandas import (compat, DataFrame, Series, Index, MultiIndex, Categorical, date_range, isna) import pandas as pd @@ -1129,6 +1129,19 @@ def test_reindex_multi(self): assert_frame_equal(result, expected) + def test_reindex_multi_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = pd.MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + df = pd.DataFrame({'a': range(len(midx))}, index=midx) + df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] + + result = df2.reindex(midx) + expected = pd.DataFrame( + {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + assert_frame_equal(result, expected) + data = [[1, 2, 3], [1, 2, 3]] @pytest.mark.parametrize('actual', [ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cb76195eacf40..d021396a7acb3 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -850,3 +850,23 @@ def test_empty_prod(): result = df.groupby("A", observed=False).B.prod(min_count=1) expected = pd.Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) + + +def test_groupby_multiindex_categorical_datetime(): + # https://github.com/pandas-dev/pandas/issues/21390 + + df = pd.DataFrame({ + 'key1': pd.Categorical(list('abcbabcba')), + 'key2': pd.Categorical( + list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), + 'values': np.arange(9), + }) + result = df.groupby(['key1', 'key2']).mean() + + idx = pd.MultiIndex.from_product( + [pd.Categorical(['a', 'b', 'c']), + pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + names=['key1', 'key2']) + expected = pd.DataFrame( + {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 40e64d99ac440..a7e90207c9ad7 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -12,8 +12,8 @@ import pandas as pd -from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex, - compat, date_range, period_range) +from pandas import (CategoricalIndex, Categorical, DataFrame, Index, + MultiIndex, compat, date_range, period_range) from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from 
pandas.core.dtypes.dtypes import CategoricalDtype @@ -1595,6 +1595,14 @@ def test_get_indexer_nearest(self): with pytest.raises(NotImplementedError): midx.get_indexer(['a'], method='pad', tolerance=2) + def test_get_indexer_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + result = midx.get_indexer(midx) + tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) + def test_hash_collisions(self): # non-smoke test that we don't get hash collisions From a74ee5496900e80bdc653899555fc701ce344bf7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Jul 2018 10:28:19 -0500 Subject: [PATCH 095/116] BUG: Fix MI repr with long names (#21655) (cherry picked from commit ad76ffcca0d92c3885c279c80701c2f4a3f3f177) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/io/formats/format.py | 10 +++++-- pandas/tests/io/formats/test_format.py | 38 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index bef90506477ed..61d1b83ea8f2e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -55,6 +55,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- Fixed printing of DataFrames with hierarchical columns with long names (:issue:`21180`) - Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby` with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`). - Fixed regression in unary negative operations with object dtype (:issue:`21380`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 12201f62946ac..c46f4b5ad9c18 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -636,10 +636,14 @@ def to_string(self): mid = int(round(n_cols / 2.)) mid_ix = col_lens.index[mid] col_len = col_lens[mid_ix] - adj_dif -= (col_len + 1) # adjoin adds one + # adjoin adds one + adj_dif -= (col_len + 1) col_lens = col_lens.drop(mid_ix) n_cols = len(col_lens) - max_cols_adj = n_cols - self.index # subtract index column + # subtract index column + max_cols_adj = n_cols - self.index + # GH-21180. Ensure that we print at least two. 
+ max_cols_adj = max(max_cols_adj, 2) self.max_cols_adj = max_cols_adj # Call again _chk_truncate to cut frame appropriately @@ -778,7 +782,7 @@ def space_format(x, y): str_columns = list(zip(*[[space_format(x, y) for y in x] for x in fmt_columns])) - if self.sparsify: + if self.sparsify and len(str_columns): str_columns = _sparsify(str_columns) str_columns = [list(x) for x in zip(*str_columns)] diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 63b7cb3459069..191e3f37f1c37 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -305,6 +305,44 @@ def test_repr_non_interactive(self): assert not has_truncated_repr(df) assert not has_expanded_repr(df) + def test_repr_truncates_terminal_size(self): + # https://github.com/pandas-dev/pandas/issues/21180 + # TODO: use mock fixutre. + # This is being backported, so doing it directly here. + try: + from unittest import mock + except ImportError: + mock = pytest.importorskip("mock") + + terminal_size = (118, 96) + p1 = mock.patch('pandas.io.formats.console.get_terminal_size', + return_value=terminal_size) + p2 = mock.patch('pandas.io.formats.format.get_terminal_size', + return_value=terminal_size) + + index = range(5) + columns = pd.MultiIndex.from_tuples([ + ('This is a long title with > 37 chars.', 'cat'), + ('This is a loooooonger title with > 43 chars.', 'dog'), + ]) + df = pd.DataFrame(1, index=index, columns=columns) + + with p1, p2: + result = repr(df) + + h1, h2 = result.split('\n')[:2] + assert 'long' in h1 + assert 'loooooonger' in h1 + assert 'cat' in h2 + assert 'dog' in h2 + + # regular columns + df2 = pd.DataFrame({"A" * 41: [1, 2], 'B' * 41: [1, 2]}) + with p1, p2: + result = repr(df2) + + assert df2.columns[0] in result.split('\n')[0] + def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: From 1d3766c3fd303672f29be4a71919c37443450ad8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 5 Jul 2018 21:05:12 +0200 Subject: [PATCH 096/116] DOC: clean-up 0.23.2 whatsnew file (#21750) (cherry picked from commit 2f0773f49a64d23774d66c30988c80541fd7bb6f) --- doc/source/whatsnew/v0.23.2.txt | 40 ++------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 61d1b83ea8f2e..2d7808363648b 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -62,19 +62,6 @@ Fixed Regressions - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) - Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) -.. 
_whatsnew_0232.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- -- - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- -- Build Changes ------------- @@ -86,55 +73,32 @@ Build Changes Bug Fixes ~~~~~~~~~ -**Groupby/Resample/Rolling** - -- -- - -**Timedelta** - -- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) - **Conversion** - Bug in constructing :class:`Index` with an iterator or generator (:issue:`21470`) - Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - **Indexing** - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) - Bug in :meth:`DataFrame.drop` behaviour is not consistent for unique and non-unique indexes (:issue:`21494`) - Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). -- **I/O** - Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) - Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`) -- - -**Plotting** - -- -- - -**Reshaping** - -- -- **Categorical** - Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) -- **Timezones** - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) -**Other** +**Timedelta** -- +- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) From de4455663215d2a8767fbc14e29f1e5e320603d5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 5 Jul 2018 15:49:24 -0500 Subject: [PATCH 097/116] RLS: release notes for 0.23.2 (#21752) (cherry picked from commit bd8ba3680eae9c19221ef7200928bcef68508f4a) --- doc/source/release.rst | 34 +++++++++++++++++++++++++++++++++ doc/source/whatsnew.rst | 2 ++ doc/source/whatsnew/v0.23.2.txt | 2 +- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 2f7eedfbe9a45..08200d4d276cc 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,6 +37,40 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: https://pypi.org/project/pandas * Documentation: http://pandas.pydata.org +pandas 0.23.2 +------------- + +**Release date**: July 5, 2018 + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. + +See the :ref:`full whatsnew ` for a list of all the changes. + +Thanks +~~~~~~ + +A total of 17 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. 
+ +* David Krych +* Jacopo Rota + +* Jeff Reback +* Jeremy Schendel +* Joris Van den Bossche +* Kalyan Gokhale +* Matthew Roeschke +* Michael Odintsov + +* Ming Li +* Pietro Battiston +* Tom Augspurger +* Uddeshya Singh +* Vu Le + +* alimcmaster1 + +* david-liu-brattle-1 + +* gfyoung +* jbrockmendel + pandas 0.23.1 ------------- diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index eb9211d0ceb02..0972cc9432f8e 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.23.2.txt + .. include:: whatsnew/v0.23.1.txt .. include:: whatsnew/v0.23.0.txt diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 2d7808363648b..bd86576ad8586 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -64,7 +64,7 @@ Fixed Regressions Build Changes -------------- +~~~~~~~~~~~~~ - The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) From 9b0f560a73d11b2fa72c48d7fd16126b5137f349 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 5 Jul 2018 17:04:24 -0500 Subject: [PATCH 098/116] RLS: 0.23.2 From e2f65df75efbfbb914f22605d139f73967211905 Mon Sep 17 00:00:00 2001 From: "meeseeksdev[bot]" Date: Fri, 6 Jul 2018 11:32:05 -0500 Subject: [PATCH 099/116] Backport PR #21771: Whatsnew note for v0.23.3 (#21772) --- doc/source/whatsnew/v0.23.3.txt | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 doc/source/whatsnew/v0.23.3.txt diff --git a/doc/source/whatsnew/v0.23.3.txt b/doc/source/whatsnew/v0.23.3.txt new file mode 100644 index 0000000000000..d308cf7a3cfac --- /dev/null +++ b/doc/source/whatsnew/v0.23.3.txt @@ -0,0 +1,55 @@ +.. _whatsnew_0233: + +v0.23.3 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + + +.. contents:: What's new in v0.23.3 + :local: + :backlinks: none + +.. _whatsnew_0233.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0233.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Conversion** + +- +- + +**Indexing** + +- +- + +**I/O** + +- +- + +**Categorical** + +- +- + +**Timezones** + +- +- + +**Timedelta** + +- +- From d2b7b2b2913d5da18f8df476a51b7f2f521ed99d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Jul 2018 09:31:26 -0500 Subject: [PATCH 100/116] 0.23.3 fixup (#21788) * Move 0.23.3 to 0.23.4 * 0.23.3 whatsnew (cherry picked from commit a3f8f14b24032151ba57c36f0a70192e13bfd116) --- doc/source/whatsnew/v0.23.3.txt | 56 +++--------------------------- doc/source/whatsnew/v0.23.4.txt | 60 +++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 52 deletions(-) create mode 100644 doc/source/whatsnew/v0.23.4.txt diff --git a/doc/source/whatsnew/v0.23.3.txt b/doc/source/whatsnew/v0.23.3.txt index d308cf7a3cfac..b8adce27d2523 100644 --- a/doc/source/whatsnew/v0.23.3.txt +++ b/doc/source/whatsnew/v0.23.3.txt @@ -1,55 +1,7 @@ .. _whatsnew_0233: -v0.23.3 -------- +v0.23.3 (July 7, 2018) +---------------------- -This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes -and bug fixes. We recommend that all users upgrade to this version. - - -.. 
contents:: What's new in v0.23.3 - :local: - :backlinks: none - -.. _whatsnew_0233.fixed_regressions: - -Fixed Regressions -~~~~~~~~~~~~~~~~~ - -- -- - -.. _whatsnew_0233.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -**Conversion** - -- -- - -**Indexing** - -- -- - -**I/O** - -- -- - -**Categorical** - -- -- - -**Timezones** - -- -- - -**Timedelta** - -- -- +This release fixes a build issue with the sdist for Python 3.7 (:issue:`21785`) +There are no other changes. diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt new file mode 100644 index 0000000000000..a88c22e3d01f7 --- /dev/null +++ b/doc/source/whatsnew/v0.23.4.txt @@ -0,0 +1,60 @@ +.. _whatsnew_0234: + +v0.23.4 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + + +.. contents:: What's new in v0.23.4 + :local: + :backlinks: none + +.. _whatsnew_0234.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0234.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) +- + +**Conversion** + +- +- + +**Indexing** + +- +- + +**I/O** + +- +- + +**Categorical** + +- +- + +**Timezones** + +- +- + +**Timedelta** + +- +- From a24750fbff99971ef3a31b610e74c9a0945f2aa0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Jul 2018 09:53:25 -0500 Subject: [PATCH 101/116] DOC: Updated whatsnew.rst --- doc/source/whatsnew.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 0972cc9432f8e..afd274332b3df 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.23.3.txt + .. include:: whatsnew/v0.23.2.txt .. 
include:: whatsnew/v0.23.1.txt From edb71fda022c6a155717e7a25679040ee0476639 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Jul 2018 10:09:56 -0500 Subject: [PATCH 102/116] RLS: 0.23.3 From b7a2cd4a4c6ea235005aecbc2911034c6064afd3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 13:57:42 -0500 Subject: [PATCH 103/116] Removed Need for OHLC As First Element if Used in .agg (#21769) (#21794) --- pandas/core/groupby/groupby.py | 6 ++---- pandas/tests/groupby/test_groupby.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..9d227ef37595f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3557,13 +3557,11 @@ def _aggregate_multiple_funcs(self, arg, _level): obj._selection = name results[name] = obj.aggregate(func) - if isinstance(list(compat.itervalues(results))[0], - DataFrame): - + if any(isinstance(x, DataFrame) for x in compat.itervalues(results)): # let higher level handle if _level: return results - return list(compat.itervalues(results))[0] + return DataFrame(results, columns=columns) def _wrap_output(self, output, index, names=None): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e05f9de5ea7f4..66577d738dd28 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1674,3 +1674,22 @@ def test_tuple_correct_keyerror(): [3, 4]])) with tm.assert_raises_regex(KeyError, "(7, 8)"): df.groupby((7, 8)).mean() + + +def test_groupby_agg_ohlc_non_first(): + # GH 21716 + df = pd.DataFrame([[1], [1]], columns=['foo'], + index=pd.date_range('2018-01-01', periods=2, freq='D')) + + expected = pd.DataFrame([ + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1] + ], columns=pd.MultiIndex.from_tuples(( + ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'), + ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'), + ('foo', 'sum', 'foo'))), index=pd.date_range( + '2018-01-01', periods=2, freq='D')) + + result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) + + tm.assert_frame_equal(result, expected) From 5609eff083baeacbfc80ce9c3a086c7530a7f2b4 Mon Sep 17 00:00:00 2001 From: "meeseeksdev[bot]" Date: Wed, 18 Jul 2018 21:41:02 -0400 Subject: [PATCH 104/116] Backport PR #21921: BUG:Clip with a list-like threshold with a nan is broken (GH19992) (#21967) --- doc/source/whatsnew/v0.23.4.txt | 4 ++++ pandas/core/generic.py | 6 ++++-- pandas/tests/frame/test_analytics.py | 18 ++++++++++++++---- pandas/tests/series/test_analytics.py | 8 ++++++-- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index a88c22e3d01f7..5e19ab491647d 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -58,3 +58,7 @@ Bug Fixes - - + +**Missing** + +- Bug in :func:`Series.clip` and :func:`DataFrame.clip` cannot accept list-like threshold containing ``NaN`` (:issue:`19992`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 02462218e8b02..facc709877285 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6433,9 +6433,11 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, # GH 17276 # numpy doesn't like NaN as a clip value # so ignore - if np.any(pd.isnull(lower)): + # GH 19992 + # numpy doesn't drop a list-like bound containing NaN + if not is_list_like(lower) and np.any(pd.isnull(lower)): lower = None - if 
np.any(pd.isnull(upper)): + if not is_list_like(upper) and np.any(pd.isnull(upper)): upper = None # GH 2747 (arguments were reversed) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 437d3a9d24730..415ae982673ee 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2195,13 +2195,23 @@ def test_clip_with_na_args(self): """Should process np.nan argument as None """ # GH # 17276 tm.assert_frame_equal(self.frame.clip(np.nan), self.frame) - tm.assert_frame_equal(self.frame.clip(upper=[1, 2, np.nan]), - self.frame) - tm.assert_frame_equal(self.frame.clip(lower=[1, np.nan, 3]), - self.frame) tm.assert_frame_equal(self.frame.clip(upper=np.nan, lower=np.nan), self.frame) + # GH #19992 + df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6], + 'col_2': [7, 8, 9]}) + + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan], + 'col_2': [7, 8, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6], + 'col_2': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + # Matrix-like def test_dot(self): a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1e6ea96a5de51..bcf209521f913 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1140,11 +1140,15 @@ def test_clip_with_na_args(self): s = Series([1, 2, 3]) assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) - assert_series_equal(s.clip(upper=[1, 1, np.nan]), Series([1, 2, 3])) - assert_series_equal(s.clip(lower=[1, np.nan, 1]), Series([1, 2, 3])) assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) + # GH #19992 + assert_series_equal(s.clip(lower=[0, 4, np.nan]), + Series([1, 4, np.nan])) + assert_series_equal(s.clip(upper=[1, np.nan, 1]), + Series([1, np.nan, 1])) + def test_clip_against_series(self): # GH #6966 From 6a0a95058659cec7515b0233d7795417dfb074fe Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 20 Jul 2018 05:28:01 -0700 Subject: [PATCH 105/116] Backport PR #21966: Fix memory leak in roll_quantile (#21973) --- doc/source/whatsnew/v0.23.4.txt | 1 + pandas/_libs/window.pyx | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index 5e19ab491647d..a30fbc75f11f8 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -27,6 +27,7 @@ Bug Fixes **Groupby/Resample/Rolling** - Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) +- Bug in ``roll_quantile`` caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`) - **Conversion** diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 5121d293efcb6..a77433e5d1115 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1482,6 +1482,8 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, else: output[i] = NaN + skiplist_destroy(skiplist) + return output From 14e1985f7a34b311cfb57c6f4f1bfe407e64bc75 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Thu, 26 Jul 2018 12:32:29 -0500 Subject: [PATCH 106/116] BUG: 
rolling with MSVC 2017 build (#21813) * Appveyor 3.7 * ci package list * change image type * try hack fix * lint * use isnan on problem function * use numpy compat isnan * use right isnan * work around OSX math undefs * cleanup const * fix reversion * ... (cherry picked from commit 7a2fbce899aad302891ff9a95aeb1bd55efe533a) --- appveyor.yml | 2 ++ doc/source/whatsnew/v0.23.4.txt | 2 +- pandas/_libs/src/headers/cmath | 1 + pandas/_libs/window.pyx | 21 +++++++++++---------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index f70fc829ec971..c6199c1493f22 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,12 +20,14 @@ environment: matrix: - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" CONDA_PY: "27" diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index a30fbc75f11f8..7890d199564f6 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ -- +- Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`) - .. _whatsnew_0234.bug_fixes: diff --git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath index d8e2239406cae..2bccf9bb13d77 100644 --- a/pandas/_libs/src/headers/cmath +++ b/pandas/_libs/src/headers/cmath @@ -6,6 +6,7 @@ #if defined(_MSC_VER) && (_MSC_VER < 1800) #include namespace std { + __inline int isnan(double x) { return _isnan(x); } __inline int signbit(double num) { return _copysign(1.0, num) < 0; } } #else diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index a77433e5d1115..6954094b46e69 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -14,6 +14,7 @@ cnp.import_array() cdef extern from "../src/headers/cmath" namespace "std": + bint isnan(double) nogil int signbit(double) nogil double sqrt(double x) nogil @@ -654,16 +655,16 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, double *ssqdm_x) nogil: """ add a value from the var calc """ cdef double delta - - # Not NaN - if val == val: - nobs[0] = nobs[0] + 1 - - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] - mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] + # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug + if isnan(val): + return + + nobs[0] = nobs[0] + 1 + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, From 398582616c434330283d82fd029ace7dbd3c6993 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Tue, 17 Jul 2018 14:01:51 +0200 Subject: [PATCH 107/116] DOC add Python2.7 warning to recent whatsnew; include 23.3 (#21944) (cherry picked from commit 4802002ab0564ae384e425c074fde688a228a43f) --- doc/source/whatsnew/v0.23.1.txt | 5 +++++ 
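For readers tracing the rewritten ``add_var`` in the window.pyx hunk above: the recurrence is Welford's online variance update, with the NaN check hoisted to an early return. Below is a plain-Python sketch of the same single-pass computation — an illustrative re-derivation, not code from the patch, and the helper name ``welford_var`` is invented here::

    import math

    def welford_var(values, ddof=1):
        # Single-pass variance via Welford's method; NaNs are skipped
        # up front, mirroring the `if isnan(val): return` guard above.
        nobs = 0
        mean = 0.0
        ssqdm = 0.0  # running sum of squared deviations from the mean
        for val in values:
            if math.isnan(val):
                continue
            nobs += 1
            delta = val - mean
            mean += delta / nobs
            # same quantity as the textbook M2 += delta * (val - new_mean)
            ssqdm += (nobs - 1) * delta ** 2 / nobs
        return ssqdm / (nobs - ddof) if nobs > ddof else float('nan')

    welford_var([1.0, float('nan'), 2.0, 4.0])  # 2.333..., NaN ignored

Per the inline comment in the patch, the earlier ``val == val`` NaN test misbehaved under MSVC 2017 (GH 21813), which is why the fix switches to ``isnan``.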
doc/source/whatsnew/v0.23.2.txt | 4 ++++ doc/source/whatsnew/v0.23.4.txt | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index a52ba22cf36d2..9f8635743ea6a 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -6,6 +6,11 @@ v0.23.1 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + .. contents:: What's new in v0.23.1 :local: :backlinks: none diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index bd86576ad8586..77ad860fc4e8e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -11,6 +11,10 @@ and bug fixes. We recommend that all users upgrade to this version. Pandas 0.23.2 is first pandas release that's compatible with Python 3.7 (:issue:`20552`) +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. .. contents:: What's new in v0.23.2 :local: diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index 7890d199564f6..c17f4ffdd6b8e 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -6,6 +6,10 @@ v0.23.4 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. .. contents:: What's new in v0.23.4 :local: From 12cfef9f80732279687df4ca701967c0ead0a1cf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Aug 2018 15:26:40 -0500 Subject: [PATCH 108/116] 0.23.4 whatsnew (#22177) (cherry picked from commit e4381b6e7c3cf1c6f424d01e3dc2613710d79b0d) --- doc/source/whatsnew/v0.23.4.txt | 36 ++------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index c17f4ffdd6b8e..9a3ad3f61ee49 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -1,7 +1,7 @@ .. _whatsnew_0234: -v0.23.4 -------- +v0.23.4 (August 3, 2018) +------------------------ This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. @@ -21,7 +21,6 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`) -- .. 
_whatsnew_0234.bug_fixes: @@ -32,37 +31,6 @@ Bug Fixes - Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) - Bug in ``roll_quantile`` caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`) -- - -**Conversion** - -- -- - -**Indexing** - -- -- - -**I/O** - -- -- - -**Categorical** - -- -- - -**Timezones** - -- -- - -**Timedelta** - -- -- **Missing** From b9bacc95c013db0c5cb23a6ddc5496c39668a7c4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 28 Jul 2018 09:16:07 -0400 Subject: [PATCH 109/116] TST: skip pytables test with not-updated pytables conda package (#22099) (cherry picked from commit 017e910a90cbb29c0f844f4d6aa966ebb5cd680a) --- pandas/tests/io/test_pytables.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 7dafc9603f96d..3c6b52074763e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -14,7 +14,7 @@ from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, - isna, compat, concat, Timestamp) + isna, compat, concat, Timestamp, _np_version_under1p15) import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -2140,6 +2140,10 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block...... pytest.raises(TypeError, store.append, 'df_unimplemented', df) + @pytest.mark.skipif( + not _np_version_under1p15, + reason=("pytables conda build package needs build " + "with numpy 1.15: gh-22098")) def test_calendar_roundtrip_issue(self): # 8591 From 0409521665bd436a10aea7e06336066bf07ff057 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Aug 2018 12:19:26 -0500 Subject: [PATCH 110/116] RLS: 0.23.4 From c420e75851361025c8f20c5d00c44c7feef56d5a Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 7 Aug 2018 09:23:03 -0700 Subject: [PATCH 111/116] Added whatsnew for v0.23.5 (#22233) --- doc/source/whatsnew/v0.23.5.txt | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 doc/source/whatsnew/v0.23.5.txt diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt new file mode 100644 index 0000000000000..ee0ee4259f86d --- /dev/null +++ b/doc/source/whatsnew/v0.23.5.txt @@ -0,0 +1,39 @@ +.. _whatsnew_0235: + +v0.23.5 (TBD 0, 2018) +--------------------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. contents:: What's new in v0.23.5 + :local: + :backlinks: none + +.. _whatsnew_0235.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. 
_whatsnew_0235.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- +- + +**Missing** + +- +- From faa199298eaeb1173571da47eaaecaf3b455c7d3 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Aug 2018 08:45:12 -0600 Subject: [PATCH 112/116] Backport PR #22169: BUG: Fix using "inf"/"-inf" in na_values for csv with int index column (#22259) --- doc/source/whatsnew/v0.23.5.txt | 4 ++++ pandas/core/algorithms.py | 4 ++-- pandas/tests/io/parser/na_values.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index ee0ee4259f86d..6a36adb915b3c 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -37,3 +37,7 @@ Bug Fixes - - + +**I/O** + +- Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bcde32696c1ff..9d8d208d2d5c1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -95,7 +95,7 @@ def _ensure_data(values, dtype=None): values = _ensure_float64(values) return values, 'float64', 'float64' - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here return _ensure_object(values), 'object', 'object' @@ -429,7 +429,7 @@ def isin(comps, values): values = values.astype('int64', copy=False) comps = comps.astype('int64', copy=False) f = lambda x, y: htable.ismember_int64(x, y) - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): values = values.astype(object) comps = comps.astype(object) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index d2c3f82e95c4d..cc224efd533b7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -369,3 +369,14 @@ def test_no_na_filter_on_index(self): expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index([np.nan, 5.0], name="b")) tm.assert_frame_equal(out, expected) + + def test_inf_na_values_with_int_index(self): + # see gh-17128 + data = "idx,col1,col2\n1,3,4\n2,inf,-inf" + + # Don't fail with OverflowError with infs and integer index column + out = self.read_csv(StringIO(data), index_col=[0], + na_values=['inf', '-inf']) + expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, + index=Index([1, 2], name="idx")) + tm.assert_frame_equal(out, expected) From 11c0523f8fffe33131890d6bd2c71f8edacea5c4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Aug 2018 08:45:29 -0600 Subject: [PATCH 113/116] Backport PR #22253: Resampling with NaT in TimedeltaIndex raises MemoryError (#22258) --- doc/source/whatsnew/v0.23.5.txt | 2 +- pandas/core/resample.py | 3 +-- pandas/tests/test_resample.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 6a36adb915b3c..304ab12752ad4 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -30,7 +30,7 @@ Bug Fixes **Groupby/Resample/Rolling** -- +- Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). 
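As a usage illustration for the resample entry just above — the inputs and the expected frame are taken from the regression test that ships with the fix (a sketch only; run it in any interpreter)::

    import numpy as np
    import pandas as pd

    # A TimedeltaIndex containing NaT; this raised MemoryError before.
    index = pd.to_timedelta(['0s', pd.NaT, '2s'])
    df = pd.DataFrame({'value': [2, 3, 5]}, index=index)

    result = df.resample('1s').mean()

    # Per the regression test, `result` now matches:
    expected = pd.DataFrame(
        {'value': [2.5, np.nan, 5.0]},
        index=pd.timedelta_range('0 day', periods=3, freq='1S'))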
- **Missing** diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0707cc756682e..e6b9f88c52cd7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1383,8 +1383,7 @@ def _get_time_delta_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - start = ax[0] - end = ax[-1] + start, end = ax.min(), ax.max() labels = binner = TimedeltaIndex(start=start, end=end, freq=self.freq, diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c1257cce9a9a4..bcc50a25623a1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2870,6 +2870,16 @@ def test_asfreq_bug(self): freq='1T')) assert_frame_equal(result, expected) + def test_resample_with_nat(self): + # GH 13223 + index = pd.to_timedelta(['0s', pd.NaT, '2s']) + result = DataFrame({'value': [2, 3, 5]}, index).resample('1s').mean() + expected = DataFrame({'value': [2.5, np.nan, 5.0]}, + index=timedelta_range('0 day', + periods=3, + freq='1S')) + assert_frame_equal(result, expected) + class TestResamplerGrouper(object): From 932de54ac027b9cc8147642ea4448a63fdda33b2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 20 Aug 2018 04:04:28 -0700 Subject: [PATCH 114/116] Backport PR #22424: CI: add missing tzlocal dependency (rpy2, doc build) (#22425) --- ci/travis-36-doc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index c22dddbe0ba3f..8705b82412e7c 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -36,6 +36,7 @@ dependencies: - sphinx - sqlalchemy - statsmodels + - tzlocal - xarray - xlrd - xlsxwriter From 183e92f1309a15e34c890e6c18dd5c7c53f61210 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 11 Sep 2018 09:40:58 -0700 Subject: [PATCH 115/116] CI / BLD: Various CI Backports (#22637) * CI: Bump NumPy to 1.9.3 Backport of gh-22499. * BLD: Fix openpyxl to 2.5.5 Backport of gh-22601. * CI: Resolve timeout issue on Travis Backported from gh-22429. * CI: Migrate to CircleCI 2.0 Backport of gh-21814. * Upgrade Cython to >=0.28.2 Backported from gh-21688. * TST: Patch locale handling Backported from gh-21739. Backport of gh-22213. 
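One note on the ``_get_time_delta_bins`` change in the resample patch above: the fix swaps positional endpoints for ``.min()``/``.max()``, which skip ``NaT`` and tolerate an unsorted axis — positional ``ax[0]``/``ax[-1]`` guarantee neither, which is how ``NaT`` could end up poisoning the ``timedelta_range`` endpoints. A minimal check of the difference (illustrative only)::

    import pandas as pd

    idx = pd.to_timedelta([pd.NaT, '0s', '2s'])

    idx[0], idx[-1]       # (NaT, Timedelta('0 days 00:00:02'))
    idx.min(), idx.max()  # (Timedelta('0 days 00:00:00'),
                          #  Timedelta('0 days 00:00:02')) -- NaT skipped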
--- .circleci/config.yml | 147 ++++++++++++++++++ ci/appveyor-27.yaml | 2 +- ci/appveyor-36.yaml | 2 +- ci/circle-27-compat.yaml | 6 +- ci/circle-35-ascii.yaml | 2 +- ci/circle-36-locale.yaml | 2 +- ci/circle-36-locale_slow.yaml | 2 +- ci/install_circle.sh | 19 +-- ci/install_db_circle.sh | 8 - ci/requirements-optional-conda.txt | 2 +- ci/requirements-optional-pip.txt | 4 +- ci/run_circle.sh | 2 +- ci/travis-27-locale.yaml | 2 +- ci/travis-27.yaml | 1 + ci/travis-35-osx.yaml | 2 +- ci/travis-36-doc.yaml | 2 +- ci/travis-36-slow.yaml | 2 +- ci/travis-36.yaml | 2 +- circle.yml | 38 ----- pandas/tests/indexes/datetimes/test_misc.py | 19 ++- pandas/tests/io/json/test_compression.py | 2 + pandas/tests/io/json/test_pandas.py | 2 + pandas/tests/io/parser/test_network.py | 2 + pandas/tests/io/test_excel.py | 1 + .../tests/scalar/timestamp/test_timestamp.py | 20 ++- pandas/tests/series/test_datetime_values.py | 20 ++- pandas/tests/util/test_util.py | 22 ++- pandas/util/testing.py | 32 ++-- 28 files changed, 272 insertions(+), 95 deletions(-) create mode 100644 .circleci/config.yml delete mode 100755 ci/install_db_circle.sh delete mode 100644 circle.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000000..e947f30d285cd --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,147 @@ +version: 2 +jobs: + + # -------------------------------------------------------------------------- + # 0. py27_compat + # -------------------------------------------------------------------------- + py27_compat: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + environment: + JOB: "2.7_COMPAT" + ENV_FILE: "ci/circle-27-compat.yaml" + LOCALE_OVERRIDE: "it_IT.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 1. py36_locale + # -------------------------------------------------------------------------- + py36_locale: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE" + ENV_FILE: "ci/circle-36-locale.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 2. 
py36_locale_slow + # -------------------------------------------------------------------------- + py36_locale_slow: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE_SLOW" + ENV_FILE: "ci/circle-36-locale_slow.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --only-slow --skip-network + + # -------------------------------------------------------------------------- + # 3. py35_ascii + # -------------------------------------------------------------------------- + py35_ascii: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.5_ASCII" + ENV_FILE: "ci/circle-35-ascii.yaml" + LOCALE_OVERRIDE: "C" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + +workflows: + version: 2 + build_and_test: + jobs: + - py27_compat + - py36_locale + - py36_locale_slow + - py35_ascii diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml index 84107c605b14f..e47ebf75344fa 100644 --- a/ci/appveyor-27.yaml +++ b/ci/appveyor-27.yaml @@ -12,7 +12,7 @@ dependencies: - matplotlib - numexpr - numpy=1.10* - - openpyxl + - openpyxl=2.5.5 - pytables==3.2.2 - python=2.7.* - pytz diff --git a/ci/appveyor-36.yaml b/ci/appveyor-36.yaml index 5e370de39958a..d007f04ca0720 100644 --- a/ci/appveyor-36.yaml +++ b/ci/appveyor-36.yaml @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy=1.13* - - openpyxl + - openpyxl=2.5.5 - pyarrow - pytables - python-dateutil diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml index 81a48d4edf11c..e037877819b14 100644 --- a/ci/circle-27-compat.yaml +++ b/ci/circle-27-compat.yaml @@ -4,11 +4,11 @@ channels: - conda-forge dependencies: - bottleneck=1.0.0 - - cython=0.24 + - cython=0.28.2 - jinja2=2.8 - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr - - numpy=1.9.2 - - openpyxl + - numpy=1.9.3 + - openpyxl=2.5.5 - psycopg2 - pytables=3.2.2 - python-dateutil=2.5.0 diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml index 602c414b49bb2..745678791458d 100644 --- a/ci/circle-35-ascii.yaml +++ b/ci/circle-35-ascii.yaml @@ -2,7 +2,7 @@ name: pandas channels: - defaults dependencies: - - cython + - cython>=0.28.2 - nomkl - numpy - python-dateutil diff --git a/ci/circle-36-locale.yaml b/ci/circle-36-locale.yaml index cc852c1e2aeeb..a85e0b58f5e33 100644 --- a/ci/circle-36-locale.yaml +++ b/ci/circle-36-locale.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pymysql - pytables diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml index 
cc852c1e2aeeb..a85e0b58f5e33 100644 --- a/ci/circle-36-locale_slow.yaml +++ b/ci/circle-36-locale_slow.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pymysql - pytables diff --git a/ci/install_circle.sh b/ci/install_circle.sh index 5ffff84c88488..f8bcf6bcffc99 100755 --- a/ci/install_circle.sh +++ b/ci/install_circle.sh @@ -6,14 +6,7 @@ echo "[home_dir: $home_dir]" echo "[ls -ltr]" ls -ltr -echo "[Using clean Miniconda install]" -rm -rf "$MINICONDA_DIR" - -# install miniconda -wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 -bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 - -export PATH="$MINICONDA_DIR/bin:$PATH" +apt-get update -y && apt-get install -y build-essential postgresql-client-9.6 echo "[update conda]" conda config --set ssl_verify false || exit 1 @@ -48,9 +41,17 @@ source $ENVS_FILE # edit the locale override if needed if [ -n "$LOCALE_OVERRIDE" ]; then + + apt-get update && apt-get -y install locales locales-all + + export LANG=$LOCALE_OVERRIDE + export LC_ALL=$LOCALE_OVERRIDE + + python -c "import locale; locale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")" || exit 1; + echo "[Adding locale to the first line of pandas/__init__.py]" rm -f pandas/__init__.pyc - sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")\n" sed -i "$sedc" pandas/__init__.py echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py diff --git a/ci/install_db_circle.sh b/ci/install_db_circle.sh deleted file mode 100755 index a00f74f009f54..0000000000000 --- a/ci/install_db_circle.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -echo "installing dbs" -mysql -e 'create database pandas_nosetest;' -psql -c 'create database pandas_nosetest;' -U postgres - -echo "done" -exit 0 diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index e8cfcdf80f2e8..ca60c772392e7 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -11,7 +11,7 @@ lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql pytables diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 877c52fa0b4fd..a6009c270c2a6 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -13,7 +13,7 @@ lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql tables @@ -26,4 +26,4 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt diff --git a/ci/run_circle.sh b/ci/run_circle.sh index 435985bd42148..fc2a8b849a354 100755 --- a/ci/run_circle.sh +++ b/ci/run_circle.sh @@ -6,4 +6,4 @@ export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas +pytest --strict --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index 1312c1296d46a..eacae4630edeb 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.24 - lxml - matplotlib=1.4.3 - - numpy=1.9.2 + - numpy=1.9.3 - openpyxl=2.4.0 - python-dateutil - python-blosc diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 22b993a2da886..26a520a16a4cc 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -27,6 
+27,7 @@ dependencies: - PyCrypto - pymysql=0.6.3 - pytables + - blosc=1.14.3 - python-blosc - python-dateutil=2.5.0 - python=2.7* diff --git a/ci/travis-35-osx.yaml b/ci/travis-35-osx.yaml index e74abac4c9775..5722d91781999 100644 --- a/ci/travis-35-osx.yaml +++ b/ci/travis-35-osx.yaml @@ -12,7 +12,7 @@ dependencies: - nomkl - numexpr - numpy=1.10.4 - - openpyxl + - openpyxl=2.5.5 - pytables - python=3.5* - pytz diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index 8705b82412e7c..05ff26020ac7d 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -21,7 +21,7 @@ dependencies: - notebook - numexpr - numpy=1.13* - - openpyxl + - openpyxl=2.5.5 - pandoc - pyqt - pytables diff --git a/ci/travis-36-slow.yaml b/ci/travis-36-slow.yaml index 6c475dc48723c..ae6353216cc2d 100644 --- a/ci/travis-36-slow.yaml +++ b/ci/travis-36-slow.yaml @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - patsy - psycopg2 - pymysql diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index 006276ba1a65f..83f963b9d9b6d 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -17,7 +17,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pyarrow - pymysql diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 66415defba6fe..0000000000000 --- a/circle.yml +++ /dev/null @@ -1,38 +0,0 @@ -machine: - environment: - # these are globally set - MINICONDA_DIR: /home/ubuntu/miniconda3 - - -database: - override: - - ./ci/install_db_circle.sh - - -checkout: - post: - # since circleci does a shallow fetch - # we need to populate our tags - - git fetch --depth=1000 - - -dependencies: - override: - - > - case $CIRCLE_NODE_INDEX in - 0) - sudo apt-get install language-pack-it && ./ci/install_circle.sh JOB="2.7_COMPAT" ENV_FILE="ci/circle-27-compat.yaml" LOCALE_OVERRIDE="it_IT.UTF-8" ;; - 1) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE" ENV_FILE="ci/circle-36-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 2) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE_SLOW" ENV_FILE="ci/circle-36-locale_slow.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 3) - ./ci/install_circle.sh JOB="3.5_ASCII" ENV_FILE="ci/circle-35-ascii.yaml" LOCALE_OVERRIDE="C" ;; - esac - - ./ci/show_circle.sh - - -test: - override: - - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: - parallel: true diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 056924f2c6663..743cbc107cce5 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,5 +1,6 @@ import locale import calendar +import unicodedata import pytest @@ -7,7 +8,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (Index, DatetimeIndex, datetime, offsets, - date_range, Timestamp) + date_range, Timestamp, compat) class TestTimeSeries(object): @@ -284,10 +285,24 @@ def test_datetime_name_accessors(self, time_locale): dti = DatetimeIndex(freq='M', start='2012', end='2013') result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) + + # work around different normalization schemes + # https://github.com/pandas-dev/pandas/issues/22342 + if not 
compat.PY2: + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + tm.assert_index_equal(result, expected) + for date, expected in zip(dti, expected_months): result = date.month_name(locale=time_locale) - assert result == expected.capitalize() + expected = expected.capitalize() + + if not compat.PY2: + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", result) + + assert result == expected dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.month_name(locale=time_locale)[-1]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 05ceace20f5a4..1b9cbc57865d2 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -2,6 +2,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.util.testing import assert_frame_equal, assert_raises_regex @@ -31,6 +32,7 @@ def test_read_zipped_json(datapath): assert_frame_equal(uncompressed_df, compressed_df) +@td.skip_if_not_us_locale def test_with_s3_url(compression): boto3 = pytest.importorskip('boto3') pytest.importorskip('s3fs') diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index bcbac4400c953..b5a2be87de1c4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -15,6 +15,7 @@ assert_series_equal, network, ensure_clean, assert_index_equal) import pandas.util.testing as tm +import pandas.util._test_decorators as td _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -1040,6 +1041,7 @@ def test_read_inline_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) + @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_resource): # GH17200 diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index e2243b8087a5b..72d2c5fd8d18f 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -55,10 +55,12 @@ def tips_df(datapath): @pytest.mark.usefixtures("s3_resource") +@td.skip_if_not_us_locale() class TestS3(object): def test_parse_public_s3_bucket(self, tips_df): pytest.importorskip('s3fs') + # more of an integration test due to the not-public contents portion # can probably mock this though. 
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4e2b2af0ebfe7..20f403e71fd36 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -576,6 +576,7 @@ def test_read_from_http_url(self, ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_no('s3fs') + @td.skip_if_not_us_locale def test_read_from_s3_url(self, ext): boto3 = pytest.importorskip('boto3') moto = pytest.importorskip('moto') diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 4689c7bea626f..e829506e95b53 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -5,6 +5,7 @@ import dateutil import calendar import locale +import unicodedata import numpy as np from dateutil.tz import tzutc @@ -20,7 +21,7 @@ from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz from pandas.errors import OutOfBoundsDatetime -from pandas.compat import long, PY3 +from pandas.compat import long, PY3, PY2 from pandas.compat.numpy import np_datetime64_compat from pandas import Timestamp, Period, Timedelta, NaT @@ -116,8 +117,21 @@ def test_names(self, data, time_locale): expected_day = calendar.day_name[0].capitalize() expected_month = calendar.month_name[8].capitalize() - assert data.day_name(time_locale) == expected_day - assert data.month_name(time_locale) == expected_month + result_day = data.day_name(time_locale) + result_month = data.month_name(time_locale) + + # Work around https://github.com/pandas-dev/pandas/issues/22342 + # different normalizations + + if not PY2: + expected_day = unicodedata.normalize("NFD", expected_day) + expected_month = unicodedata.normalize("NFD", expected_month) + + result_day = unicodedata.normalize("NFD", result_day,) + result_month = unicodedata.normalize("NFD", result_month) + + assert result_day == expected_day + assert result_month == expected_month # Test NaT nan_ts = Timestamp(NaT) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 47798d0ddd7f5..5e924ac5c8894 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -3,6 +3,7 @@ import locale import calendar +import unicodedata import pytest from datetime import datetime, date @@ -13,7 +14,8 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range, - PeriodIndex, DatetimeIndex, TimedeltaIndex) + PeriodIndex, DatetimeIndex, TimedeltaIndex, + compat) import pandas.core.common as com from pandas.util.testing import assert_series_equal @@ -309,10 +311,24 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): s = Series(DatetimeIndex(freq='M', start='2012', end='2013')) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) + + # work around https://github.com/pandas-dev/pandas/issues/22342 + if not compat.PY2: + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + tm.assert_series_equal(result, expected) + for s_date, expected in zip(s, expected_months): result = s_date.month_name(locale=time_locale) - assert result == expected.capitalize() + expected = expected.capitalize() + + if not compat.PY2: + result = unicodedata.normalize("NFD", result) + expected = 
unicodedata.normalize("NFD", expected) + + assert result == expected + s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1]) diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 145be7f85b193..c049dfc874940 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -433,6 +433,26 @@ def teardown_class(cls): del cls.locales del cls.current_locale + def test_can_set_locale_valid_set(self): + # Setting the default locale should return True + assert tm.can_set_locale('') is True + + def test_can_set_locale_invalid_set(self): + # Setting an invalid locale should return False + assert tm.can_set_locale('non-existent_locale') is False + + def test_can_set_locale_invalid_get(self, monkeypatch): + # In some cases, an invalid locale can be set, + # but a subsequent getlocale() raises a ValueError + # See GH 22129 + + def mockgetlocale(): + raise ValueError() + + with monkeypatch.context() as m: + m.setattr(locale, 'getlocale', mockgetlocale) + assert tm.can_set_locale('') is False + def test_get_locales(self): # all systems should have at least a single locale assert len(tm.get_locales()) > 0 @@ -466,7 +486,7 @@ def test_set_locale(self): enc = codecs.lookup(enc).name new_locale = lang, enc - if not tm._can_set_locale(new_locale): + if not tm.can_set_locale(new_locale): with pytest.raises(locale.Error): with tm.set_locale(new_locale): pass diff --git a/pandas/util/testing.py b/pandas/util/testing.py index b7edbff00a4b9..bb79c25126fab 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -478,6 +478,8 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): A string of the form .. For example to set the current locale to US English with a UTF8 encoding, you would pass "en_US.UTF-8". + lc_var : int, default `locale.LC_ALL` + The category of the locale being set. Notes ----- @@ -489,37 +491,37 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): try: locale.setlocale(lc_var, new_locale) - - try: - normalized_locale = locale.getlocale() - except ValueError: - yield new_locale + normalized_locale = locale.getlocale() + if com._all_not_none(*normalized_locale): + yield '.'.join(normalized_locale) else: - if com._all_not_none(*normalized_locale): - yield '.'.join(normalized_locale) - else: - yield new_locale + yield new_locale finally: locale.setlocale(lc_var, current_locale) -def _can_set_locale(lc): - """Check to see if we can set a locale without throwing an exception. +def can_set_locale(lc, lc_var=locale.LC_ALL): + """ + Check to see if we can set a locale, and subsequently get the locale, + without raising an Exception. Parameters ---------- lc : str The locale to attempt to set. + lc_var : int, default `locale.LC_ALL` + The category of the locale being set. 
Returns ------- - isvalid : bool + is_valid : bool Whether the passed locale can be set """ try: - with set_locale(lc): + with set_locale(lc, lc_var=lc_var): pass - except locale.Error: # horrible name for a Exception subclass + except (ValueError, + locale.Error): # horrible name for a Exception subclass return False else: return True @@ -546,7 +548,7 @@ def _valid_locales(locales, normalize): else: normalizer = lambda x: x.strip() - return list(filter(_can_set_locale, map(normalizer, locales))) + return list(filter(can_set_locale, map(normalizer, locales))) # ----------------------------------------------------------------------------- # Stdout / stderr decorators From af7b0ba461a5b81733afdc7fc816a869b798093d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 11 Sep 2018 14:45:25 -0700 Subject: [PATCH 116/116] BUG: NaN should have pct rank of NaN (#22634) Backport of gh-22600. --- doc/source/whatsnew/v0.23.5.txt | 3 +++ pandas/_libs/groupby_helper.pxi.in | 7 ++++++- pandas/tests/groupby/test_rank.py | 19 ++++++++++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 304ab12752ad4..f69e38e7fdd50 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -20,6 +20,9 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ +- Calling :meth:`DataFrameGroupBy.rank` and :meth:`SeriesGroupBy.rank` with empty groups + and ``pct=True`` was raising a ``ZeroDivisionError`` due to `c1068d9 + `_ (:issue:`22519`) - - diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b3e9b7c9e69ee..d7885e112a7e0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -587,7 +587,12 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if pct: for i in range(N): - out[i, 0] = out[i, 0] / grp_sizes[i, 0] + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. + if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + else: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 203c3c73bec94..d978e144e5013 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -1,7 +1,7 @@ import pytest import numpy as np import pandas as pd -from pandas import DataFrame, concat +from pandas import DataFrame, Series, concat from pandas.util import testing as tm @@ -252,3 +252,20 @@ def test_rank_object_raises(ties_method, ascending, na_option, df.groupby('key').rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) + + +def test_rank_empty_group(): + # see gh-22519 + column = "A" + df = DataFrame({ + "A": [0, 1, 0], + "B": [1., np.nan, 2.] + }) + + result = df.groupby(column).B.rank(pct=True) + expected = Series([0.5, np.nan, 1.0], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby(column).rank(pct=True) + expected = DataFrame({"B": [0.5, np.nan, 1.0]}) + tm.assert_frame_equal(result, expected)
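To round out the rank fix above with a usage view — the frame and the expected output mirror ``test_rank_empty_group`` (a sketch only)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [0, 1, 0],
                       'B': [1., np.nan, 2.]})

    # Group A == 1 holds only a NaN in B, so it is empty for ranking
    # purposes; with pct=True this used to raise ZeroDivisionError and
    # now yields NaN for the NaN row instead.
    df.groupby('A').B.rank(pct=True)
    # 0    0.5
    # 1    NaN
    # 2    1.0
    # Name: B, dtype: float64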