diff --git a/.travis.yml b/.travis.yml index e5e05ed26da56..b7c18d2850a15 100644 --- a/.travis.yml +++ b/.travis.yml @@ -123,7 +123,7 @@ after_success: after_script: - echo "after_script start" - - source activate pandas && python -c "import pandas; pandas.show_versions();" + - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - if [ -e /tmp/single.xml ]; then ci/print_skipped.py /tmp/single.xml; fi diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c0c3a42cc4464..13b5cd2b06032 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -368,6 +368,11 @@ def setup(self): self.dates = (np.datetime64('now') + self.offsets) self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) + N = 1000000 + self.draws = pd.Series(np.random.randn(N)) + labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4)) + self.cats = labels.astype('category') + def time_groupby_multi_size(self): self.df.groupby(['key1', 'key2']).size() @@ -377,6 +382,10 @@ def time_groupby_dt_size(self): def time_groupby_dt_timegrouper_size(self): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + def time_groupby_size(self): + self.draws.groupby(self.cats).size() + + #---------------------------------------------------------------------- # groupby with a variable value for ngroups diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 8947a0fdd796c..6a2c9d48c4a28 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -19,6 +19,9 @@ def time_getitem_list_like(self): def time_getitem_array(self): self.s[np.arange(10000)] + def time_getitem_lists(self): + self.s[np.arange(10000).tolist()] + def time_iloc_array(self): self.s.iloc[np.arange(10000)] @@ -190,9 +193,15 @@ def setup(self): np.arange(1000)], names=['one', 'two']) import string - self.mistring = MultiIndex.from_product( - [np.arange(1000), - np.arange(20), list(string.ascii_letters)], + + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list('A')], + names=['one', 'two', 'three']) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list('A'), list('A')], names=['one', 'two', 'three']) def time_series_xs_mi_ix(self): @@ -215,8 +224,26 @@ def time_multiindex_get_indexer(self): (0, 16), (0, 17), (0, 18), (0, 19)], dtype=object)) + def time_multiindex_large_get_loc(self): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_multiindex_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_multiindex_med_get_loc(self): + self.mi_med.get_loc((999, 9, 'A')) + + def time_multiindex_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + def time_multiindex_string_get_loc(self): - self.mistring.get_loc((999, 19, 'Z')) + self.mi_small.get_loc((99, 'A', 'A')) + + def time_multiindex_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) def time_is_monotonic(self): self.miint.is_monotonic diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index c66654ee1e006..3c0e2869357ae 100644 --- 
a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -111,6 +111,7 @@ def setup(self): def time_series_dropna_int64(self): self.s.dropna() + class series_dropna_datetime(object): goal_time = 0.2 @@ -120,3 +121,13 @@ def setup(self): def time_series_dropna_datetime(self): self.s.dropna() + + +class series_clip(object): + goal_time = 0.2 + + def setup(self): + self.s = pd.Series(np.random.randn(50)) + + def time_series_clip(self): + self.s.clip(0, 1) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 601edded29f5a..8cf6f2ce636da 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -119,15 +119,7 @@ if [ "$COVERAGE" ]; then fi echo -if [ "$BUILD_TEST" ]; then - - # build & install testing - echo ["Starting installation test."] - bash ci/install_release_build.sh - conda uninstall -y cython - time pip install dist/*tar.gz || exit 1 - -else +if [ -z "$BUILD_TEST" ]; then # build but don't install echo "[build em]" @@ -163,9 +155,22 @@ fi # w/o removing anything else echo echo "[removing installed pandas]" -conda remove pandas --force +conda remove pandas -y --force -if [ -z "$BUILD_TEST" ]; then +if [ "$BUILD_TEST" ]; then + + # remove any installation + pip uninstall -y pandas + conda list pandas + pip list --format columns |grep pandas + + # build & install testing + echo "[building release]" + bash scripts/build_dist_for_release.sh + conda uninstall -y cython + time pip install dist/*tar.gz || exit 1 + +else # install our pandas echo diff --git a/ci/requirements-3.5_OSX.sh b/ci/requirements-3.5_OSX.sh index cfbd2882a8a2d..39ea1a0cf67bf 100644 --- a/ci/requirements-3.5_OSX.sh +++ b/ci/requirements-3.5_OSX.sh @@ -4,4 +4,4 @@ source activate pandas echo "install 35_OSX" -conda install -n pandas -c conda-forge feather-format +conda install -n pandas -c conda-forge feather-format==0.3.1 diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 663d2feb5be23..d79fc43fbe175 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -19,20 +19,26 @@ export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 429496 echo PYTHONHASHSEED=$PYTHONHASHSEED if [ "$BUILD_TEST" ]; then - echo "build-test" + echo "[build-test]" + + echo "[env]" + pip list --format columns |grep pandas + + echo "[running]" cd /tmp - pwd - conda list pandas - echo "running" - python -c "import pandas; pandas.test(['-n 2'])" + unset PYTHONPATH + python -c 'import pandas; pandas.test(["-n 2", "--skip-slow", "--skip-network", "-r xX", "-m not single"])' + elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" + elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + else - echo pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?"
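For context, the new series_clip benchmark above times the scalar fast path that this PR adds as _clip_with_scalar in pandas/core/generic.py (see further below). A minimal sketch of the equivalent vectorized operation, assuming plain NumPy and pandas; illustrative only, not part of the patch:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.randn(50))

    # On the fast path, Series.clip(0, 1) reduces to two np.where passes
    # (clip the upper bound, then the lower bound) instead of the generic
    # le/ge comparison plus where() alignment machinery.
    values = s.values
    result = np.where(values >= 1.0, 1.0, values)
    result = np.where(result <= 0.0, 0.0, result)
    assert (s.clip(0, 1).values == result).all()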
diff --git a/ci/script_single.sh b/ci/script_single.sh index db637679f0e0f..245b4e6152c4d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -20,8 +20,8 @@ elif [ "$COVERAGE" ]; then echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/doc/make.py b/doc/make.py index e70655c3e2f92..781347a3c3e1b 100755 --- a/doc/make.py +++ b/doc/make.py @@ -34,39 +34,52 @@ SPHINX_BUILD = 'sphinxbuild' -def upload_dev(user='pandas'): +def _process_user(user): + if user is None or user is False: + user = '' + else: + user = user + '@' + return user + + +def upload_dev(user=None): 'push a copy to the pydata dev directory' - if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/html; rsync -avz . {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'.format(user)): raise SystemExit('Upload to Pydata Dev failed') -def upload_dev_pdf(user='pandas'): +def upload_dev_pdf(user=None): 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/dev/'.format(user)): raise SystemExit('PDF upload to Pydata Dev failed') -def upload_stable(user='pandas'): +def upload_stable(user=None): 'push a copy to the pydata stable directory' - if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/html; rsync -avz . {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'.format(user)): raise SystemExit('Upload to stable failed') -def upload_stable_pdf(user='pandas'): +def upload_stable_pdf(user=None): 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/stable/'.format(user)): raise SystemExit('PDF upload to stable failed') -def upload_prev(ver, doc_root='./', user='pandas'): +def upload_prev(ver, doc_root='./', user=None): 'push a copy of older release to appropriate version directory' + user = _process_user(user) local_dir = doc_root + 'build/html' remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver - cmd = 'cd %s; rsync -avz . %s@pandas.pydata.org:%s -essh' + cmd = 'cd %s; rsync -avz . 
%spandas.pydata.org:%s -essh' cmd = cmd % (local_dir, user, remote_dir) print(cmd) if os.system(cmd): @@ -74,7 +87,7 @@ def upload_prev(ver, doc_root='./', user='pandas'): 'Upload to %s from %s failed' % (remote_dir, local_dir)) local_dir = doc_root + 'build/latex' - pdf_cmd = 'cd %s; scp pandas.pdf %s@pandas.pydata.org:%s' + pdf_cmd = 'cd %s; scp pandas.pdf %spandas.pydata.org:%s' pdf_cmd = pdf_cmd % (local_dir, user, remote_dir) if os.system(pdf_cmd): raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) diff --git a/doc/source/api.rst b/doc/source/api.rst index cb5136df1ff8b..e7d12df56d260 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -724,6 +724,7 @@ Serialization / IO / Conversion Series.to_dense Series.to_string Series.to_clipboard + Series.to_latex Sparse ~~~~~~ diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index a508e84465107..ef558381c5e6f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -453,6 +453,14 @@ the original values: np.asarray(cat) > base +When you compare two unordered categoricals with the same categories, the order is not considered: + +.. ipython:: python + + c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 == c2 + Operations ---------- diff --git a/doc/source/install.rst b/doc/source/install.rst index 578caae605471..48d51e1200447 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -202,7 +202,7 @@ installed), make sure you have `pytest Dependencies ------------ -* `setuptools `__ +* `setuptools `__ * `NumPy `__: 1.7.1 or higher * `python-dateutil `__: 1.5 or higher * `pytz `__: Needed for time zone support diff --git a/doc/source/style.ipynb b/doc/source/style.ipynb index 427b18b988aef..4eeda491426b1 100644 --- a/doc/source/style.ipynb +++ b/doc/source/style.ipynb @@ -12,7 +12,7 @@ "\n", "*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your feedback.*\n", "\n", - "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/html-styling.ipynb).\n", + "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/style.ipynb).\n", "\n", "You can apply **conditional formatting**, the visual styling of a DataFrame\n", "depending on the data within, by using the ``DataFrame.style`` property.\n", diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html index ddf1e861f5f81..a2106605c5562 100644 --- a/doc/source/themes/nature_with_gtoc/layout.html +++ b/doc/source/themes/nature_with_gtoc/layout.html @@ -94,4 +94,15 @@

{{ _('Search') }}

}); }); + {% endblock %} \ No newline at end of file diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index d6fb1c6a8f9cc..ffaeeb78c2799 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.20.2.txt + .. include:: whatsnew/v0.20.0.txt .. include:: whatsnew/v0.19.2.txt diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 504f8004bc8a6..07ab637dd29f5 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -1,6 +1,6 @@ -.. _whatsnew_0201: +.. _whatsnew_0202: -v0.20.1 (???) +v0.20.2 (???) ------------- This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, @@ -9,77 +9,98 @@ We recommend that all users upgrade to this version. Highlights include: -.. contents:: What's new in v0.20.1 +.. contents:: What's new in v0.20.2 :local: :backlinks: none -.. _whatsnew_0201.enhancements: +.. _whatsnew_0202.enhancements: Enhancements ~~~~~~~~~~~~ +- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) +- ``Series`` provides a ``to_latex`` method (:issue:`16180`) - -.. _whatsnew_0201.performance: +.. _whatsnew_0202.performance: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance regression fix when indexing with a list-like (:issue:`16285`) +- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) +- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) +- Improved performance of groupby with categorical groupers (:issue:`16413`) - -.. 
_whatsnew_0202.bug_fixes: Bug Fixes ~~~~~~~~~ +- Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) +- Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN`` values (:issue:`15593`) +- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`) + Conversion ^^^^^^^^^^ - +- Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`) +- Silence numpy warnings when broadcasting DataFrame to Series with comparison ops (:issue:`16378`, :issue:`16306`) Indexing ^^^^^^^^ - +- Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`) I/O ^^^ - +- Bug in ``pd.read_csv()`` when ``comment`` is passed in space-delimited text files (:issue:`16472`) +- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) +- Bug that raised an ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`) Plotting ^^^^^^^^ +- Bug in ``DataFrame.plot`` with a single column and a list-like ``color`` (:issue:`3486`) +- Bug in ``plot`` where ``NaT`` in ``DatetimeIndex`` results in ``Timestamp.min`` (:issue:`12405`) +- Bug in ``DataFrame.boxplot`` where ``figsize`` keyword was not respected for non-grouped boxplots (:issue:`11959`) + Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - +- Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`) Sparse ^^^^^^ - - +- Bug in construction of SparseDataFrame from ``scipy.sparse.dok_matrix`` (:issue:`16179`) Reshaping ^^^^^^^^^ - - +- Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`) +- Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`) +- Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`) +- Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. 
(:issue:`16395`) Numeric ^^^^^^^ +- Bug in ``.interpolate()``, where ``limit_direction`` was not respected when ``limit=None`` (the default) was passed (:issue:`16282`) +Categorical +^^^^^^^^^^^ - +- Fixed comparison operations considering the order of the categories when both categoricals are unordered (:issue:`16014`) Other ^^^^^ + +- Bug in ``DataFrame.drop([])`` for DataFrame with non-unique indices (:issue:`16270`) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..a0b9e9e47463c 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) + cdef inline void _check_for_collision(self, Py_ssize_t loc, object label) + cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..003fd12fab6cd 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,6 +4,9 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +from lib cimport is_null_datetimelike + + #---------------------------------------------------------------------- # VectorData #---------------------------------------------------------------------- @@ -889,6 +892,19 @@ cdef class MultiIndexHashTable(HashTable): "hash collision\nlocs:\n{}\n" "result:\n{}\nmi:\n{}".format(alocs, result, mi)) + cdef inline void _check_for_collision(self, Py_ssize_t loc, object label): + # validate that the loc maps to the actual value + # version of _check_for_collisions above for single label (tuple) + + result = self.mi[loc] + + if not all(l == r or (is_null_datetimelike(l) + and is_null_datetimelike(r)) + for l, r in zip(result, label)): + raise AssertionError( + "hash collision\nloc:\n{}\n" + "result:\n{}\nmi:\n{}".format(loc, result, label)) + def __contains__(self, object key): try: self.get_item(key) @@ -907,8 +923,7 @@ cdef class MultiIndexHashTable(HashTable): k = kh_get_uint64(self.table, value) if k != self.table.n_buckets: loc = self.table.vals[k] - locs = np.array([loc], dtype=np.int64) - self._check_for_collisions(locs, key) + self._check_for_collision(loc, key) return loc else: raise KeyError(key) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c7a537acf5d6f..21680fb0b3921 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz): return tz is UTC or isinstance(tz, _du_utc) -cdef class MultiIndexEngine(IndexEngine): +cdef class MultiIndexObjectEngine(ObjectEngine): + """ + provide the same interface as the MultiIndexEngine + but use the IndexEngine for computation + + This provides good performance with smaller MI's + """ + def get_indexer(self, values): + # convert a MI to an ndarray + if hasattr(values, 'values'): + values = values.values + return super(MultiIndexObjectEngine, self).get_indexer(values) + + cpdef get_loc(self, object val): + + # convert a MI to an ndarray + if hasattr(val, 'values'): + val = val.values + return super(MultiIndexObjectEngine, self).get_loc(val) + + +cdef class MultiIndexHashEngine(ObjectEngine): + """ + Use a hashing based MultiIndex impl + but use the IndexEngine for computation + + This provides good performance with larger MI's + """ def _call_monotonic(self, object mi): # defer these back to the mi 
itself @@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine): except TypeError: raise KeyError(val) + def get_indexer(self, values): + self._ensure_mapping_populated() + return self.mapping.lookup(values) + cdef _make_hash_table(self, n): return _hash.MultiIndexHashTable(n) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 31402c38c770d..f6e574b66a828 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -950,7 +950,6 @@ def clean_index_list(list obj): Utility used in pandas.core.index._ensure_index """ cdef: - ndarray[object] converted Py_ssize_t i, n = len(obj) object v bint all_arrays = 1 @@ -964,15 +963,20 @@ def clean_index_list(list obj): if all_arrays: return obj, all_arrays - converted = np.empty(n, dtype=object) - for i in range(n): - v = obj[i] - if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data'): - converted[i] = tuple(v) - else: - converted[i] = v + # don't force numpy coerce with nan's + inferred = infer_dtype(obj) + if inferred in ['string', 'bytes', 'unicode', + 'mixed', 'mixed-integer']: + return np.asarray(obj, dtype=object), 0 + elif inferred in ['integer']: + + # TODO: we infer an integer but it *could* be a uint64 + try: + return np.asarray(obj, dtype='int64'), 0 + except OverflowError: + return np.asarray(obj, dtype='object'), 0 - return maybe_convert_objects(converted), 0 + return np.asarray(obj), 0 ctypedef fused pandas_string: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index d87a0641291b1..ddd38979e326c 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -947,8 +947,13 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, ------- numeric_array : array of converted object values to numerical ones """ + + if len(values) == 0: + return np.array([], dtype='i8') + # fastpath for ints - try to convert all based on first value cdef object val = values[0] + if util.is_integer_object(val): try: maybe_ints = values.astype('i8') diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 6b0775e54da0c..be23ebb023383 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_COMMENT; + break; } else if (!IS_WHITESPACE(c)) { self->state = START_FIELD; // fall through to subsequent state diff --git a/pandas/conftest.py b/pandas/conftest.py index 1149fae3fc0b0..8a3ffe22242ac 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -45,3 +45,13 @@ def spmatrix(request): tm._skip_if_no_scipy() from scipy import sparse return getattr(sparse, request.param + '_matrix') + + +@pytest.fixture +def ip(): + """An instance of IPython.InteractiveShell. + Will raise a skip if IPython is not installed. 
+ """ + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.interactiveshell import InteractiveShell + return InteractiveShell() diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a745ec616eda8..77d79c9585e57 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -388,7 +388,7 @@ def isin(comps, values): "[{0}]".format(type(values).__name__)) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): - values = np.array(list(values), dtype='object') + values = lib.list_to_object_array(list(values)) comps, dtype, _ = _ensure_data(comps) values, _, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 7eb86232cbb07..a5e61797bd478 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -55,17 +55,31 @@ def f(self, other): "equality or not") if isinstance(other, Categorical): # Two Categoricals can only be be compared if the categories are - # the same - if ((len(self.categories) != len(other.categories)) or - not ((self.categories == other.categories).all())): - raise TypeError("Categoricals can only be compared if " - "'categories' are the same") + # the same (maybe up to ordering, depending on ordered) + + msg = ("Categoricals can only be compared if " + "'categories' are the same.") + if len(self.categories) != len(other.categories): + raise TypeError(msg + " Categories are different lengths") + elif (self.ordered and not (self.categories == + other.categories).all()): + raise TypeError(msg) + elif not set(self.categories) == set(other.categories): + raise TypeError(msg) + if not (self.ordered == other.ordered): raise TypeError("Categoricals can only be compared if " "'ordered' is the same") - na_mask = (self._codes == -1) | (other._codes == -1) + if not self.ordered and not self.categories.equals( + other.categories): + # both unordered and different order + other_codes = _get_codes_for_values(other, self.categories) + else: + other_codes = other._codes + + na_mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) - ret = f(other._codes) + ret = f(other_codes) if na_mask.any(): # In other series, the leads to False, so do that here too ret[na_mask] = False @@ -328,6 +342,13 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): self._categories = categories self._codes = coerce_indexer_dtype(codes, categories) + def __dir__(self): + # Avoid IPython warnings for deprecated properties + # https://github.com/pandas-dev/pandas/issues/16409 + rv = set(dir(type(self))) + rv.discard("labels") + return sorted(rv) + @property def _constructor(self): return Categorical diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 19d3792f73de7..fd61813a57c98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -837,7 +837,7 @@ def try_timedelta(v): try: return to_timedelta(v)._values.reshape(shape) except: - return v + return v.reshape(shape) inferred_type = lib.infer_datetimelike_array(_ensure_object(v)) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 561f1951a4151..dc2c56ea476f9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -88,12 +88,12 @@ def is_dtype(cls, dtype): """ if hasattr(dtype, 'dtype'): dtype = dtype.dtype - if isinstance(dtype, cls): - return True - elif isinstance(dtype, np.dtype): + if isinstance(dtype, np.dtype): return False elif dtype is None: return False + elif isinstance(dtype, cls): + 
return True try: return cls.construct_from_string(dtype) is not None except: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e6ea58e7e05be..22f73490335f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -620,16 +620,6 @@ def _repr_html_(self): else: return None - def _repr_latex_(self): - """ - Returns a LaTeX representation for a particular Dataframe. - Mainly for use with nbconvert (jupyter notebook conversion to pdf). - """ - if get_option('display.latex.repr'): - return self.to_latex() - else: - return None - @property def style(self): """ @@ -1596,94 +1586,6 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, if buf is None: return formatter.buf.getvalue() - @Substitution(header='Write out column names. If a list of string is given, \ -it is assumed to be aliases for the column names.') - @Appender(fmt.common_docstring + fmt.return_docstring, indents=1) - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, bold_rows=True, - column_format=None, longtable=None, escape=None, - encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): - r""" - Render a DataFrame to a tabular environment table. You can splice - this into a LaTeX document. Requires \usepackage{booktabs}. - - `to_latex`-specific options: - - bold_rows : boolean, default True - Make the row labels bold in the output - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 - columns - longtable : boolean, default will be read from the pandas config module - Default: False. - Use a longtable environment instead of tabular. Requires adding - a \usepackage{longtable} to your LaTeX preamble. - escape : boolean, default will be read from the pandas config module - Default: True. - When set to False prevents from escaping latex special - characters in column names. - encoding : str, default None - A string representing the encoding to use in the output file, - defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. - decimal : string, default '.' - Character recognized as decimal separator, e.g. ',' in Europe. - - .. versionadded:: 0.18.0 - - multicolumn : boolean, default True - Use \multicolumn to enhance MultiIndex columns. - The default will be read from the config module. - - .. versionadded:: 0.20.0 - - multicolumn_format : str, default 'l' - The alignment for multicolumns, similar to `column_format` - The default will be read from the config module. - - .. versionadded:: 0.20.0 - - multirow : boolean, default False - Use \multirow to enhance MultiIndex rows. - Requires adding a \usepackage{multirow} to your LaTeX preamble. - Will print centered labels (instead of top-aligned) - across the contained rows, separating groups via clines. - The default will be read from the pandas config module. - - .. 
versionadded:: 0.20.0 - - """ - # Get defaults from the pandas config - if longtable is None: - longtable = get_option("display.latex.longtable") - if escape is None: - escape = get_option("display.latex.escape") - if multicolumn is None: - multicolumn = get_option("display.latex.multicolumn") - if multicolumn_format is None: - multicolumn_format = get_option("display.latex.multicolumn_format") - if multirow is None: - multirow = get_option("display.latex.multirow") - - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - header=header, index=index, - formatters=formatters, - float_format=float_format, - bold_rows=bold_rows, - sparsify=sparsify, - index_names=index_names, - escape=escape, decimal=decimal) - formatter.to_latex(column_format=column_format, longtable=longtable, - encoding=encoding, multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow) - - if buf is None: - return formatter.buf.getvalue() - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ @@ -3012,12 +2914,12 @@ def _maybe_casted_values(index, labels=None): return values new_index = _default_index(len(new_obj)) - if isinstance(self.index, MultiIndex): - if level is not None: - if not isinstance(level, (tuple, list)): - level = [level] - level = [self.index._get_level_number(lev) for lev in level] - if len(level) < len(self.index.levels): + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if isinstance(self.index, MultiIndex): + if len(level) < self.index.nlevels: new_index = self.index.droplevel(level) if not drop: @@ -3033,6 +2935,8 @@ def _maybe_casted_values(index, labels=None): multi_col = isinstance(self.columns, MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): + if not (level is None or i in level): + continue name = names[i] if multi_col: col_name = (list(name) if isinstance(name, tuple) @@ -3049,11 +2953,9 @@ def _maybe_casted_values(index, labels=None): missing = self.columns.nlevels - len(name_lst) name_lst += [col_fill] * missing name = tuple(name_lst) - # to ndarray and maybe infer different dtype level_values = _maybe_casted_values(lev, lab) - if level is None or i in level: - new_obj.insert(0, name, level_values) + new_obj.insert(0, name, level_values) new_obj.index = new_index if not inplace: @@ -3852,13 +3754,13 @@ def update(self, other, join='left', overwrite=True, filter_func=None, if overwrite: mask = isnull(that) - - # don't overwrite columns unecessarily - if mask.all(): - continue else: mask = notnull(this) + # don't overwrite columns unnecessarily + if mask.all(): + continue + self[col] = expressions.where(mask, this, that, raise_on_error=True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 27a489293db8f..1a1bbc37cd816 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12,8 +12,10 @@ from pandas._libs import tslib, lib from pandas.core.dtypes.common import ( _ensure_int64, + _ensure_object, needs_i8_conversion, is_scalar, + is_number, is_integer, is_bool, is_bool_dtype, is_numeric_dtype, @@ -44,7 +46,7 @@ import pandas.core.common as com import pandas.core.missing as missing from pandas.io.formats.printing import pprint_thing -from pandas.io.formats.format import format_percentiles +from pandas.io.formats.format import format_percentiles, DataFrameFormatter from pandas.tseries.frequencies import to_offset from pandas 
import compat from pandas.compat.numpy import function as nv @@ -1049,6 +1051,16 @@ def __setstate__(self, state): # ---------------------------------------------------------------------- # IO + def _repr_latex_(self): + """ + Returns a LaTeX representation for a particular object. + Mainly for use with nbconvert (jupyter notebook conversion to pdf). + """ + if config.get_option('display.latex.repr'): + return self.to_latex() + else: + return None + # ---------------------------------------------------------------------- # I/O Methods @@ -1266,12 +1278,17 @@ def to_hdf(self, path_or_buf, key, **kwargs): `__. Applicable only to format='table'. - complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible + complevel : int, 0-9, default 0 + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. fletcher32 : bool, default False If applying compression use the fletcher32 checksum dropna : boolean, default False. @@ -1382,8 +1399,8 @@ def to_clipboard(self, excel=None, sep=None, **kwargs): - Windows: none - OS X: none """ - from pandas.io.clipboard import clipboard - clipboard.to_clipboard(self, excel=excel, sep=sep, **kwargs) + from pandas.io import clipboards + clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) def to_xarray(self): """ @@ -1482,6 +1499,100 @@ def to_xarray(self): coords=coords, ) + _shared_docs['to_latex'] = """ + Render an object to a tabular environment table. You can splice + this into a LaTeX document. Requires \\usepackage{booktabs}. + + .. versionchanged:: 0.20.2 + Added to Series + + `to_latex`-specific options: + + bold_rows : boolean, default True + Make the row labels bold in the output + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g. 'rcl' for 3 + columns + longtable : boolean, default will be read from the pandas config module + Default: False. + Use a longtable environment instead of tabular. Requires adding + a \\usepackage{longtable} to your LaTeX preamble. + escape : boolean, default will be read from the pandas config module + Default: True. + When set to False prevents from escaping latex special + characters in column names. + encoding : str, default None + A string representing the encoding to use in the output file, + defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. + decimal : string, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + + .. versionadded:: 0.18.0 + + multicolumn : boolean, default True + Use \\multicolumn to enhance MultiIndex columns. + The default will be read from the config module. + + .. versionadded:: 0.20.0 + + multicolumn_format : str, default 'l' + The alignment for multicolumns, similar to `column_format` + The default will be read from the config module. + + .. versionadded:: 0.20.0 + + multirow : boolean, default False + Use \\multirow to enhance MultiIndex rows. 
+ Requires adding a \\usepackage{multirow} to your LaTeX preamble. + Will print centered labels (instead of top-aligned) + across the contained rows, separating groups via clines. + The default will be read from the pandas config module. + + .. versionadded:: 0.20.0 + """ + + @Substitution(header='Write out column names. If a list of string is given, \ +it is assumed to be aliases for the column names.') + @Appender(_shared_docs['to_latex'] % _shared_doc_kwargs) + def to_latex(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, bold_rows=True, + column_format=None, longtable=None, escape=None, + encoding=None, decimal='.', multicolumn=None, + multicolumn_format=None, multirow=None): + # Get defaults from the pandas config + if self.ndim == 1: + self = self.to_frame() + if longtable is None: + longtable = config.get_option("display.latex.longtable") + if escape is None: + escape = config.get_option("display.latex.escape") + if multicolumn is None: + multicolumn = config.get_option("display.latex.multicolumn") + if multicolumn_format is None: + multicolumn_format = config.get_option( + "display.latex.multicolumn_format") + if multirow is None: + multirow = config.get_option("display.latex.multirow") + + formatter = DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + header=header, index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names, + escape=escape, decimal=decimal) + formatter.to_latex(column_format=column_format, longtable=longtable, + encoding=encoding, multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow) + + if buf is None: + return formatter.buf.getvalue() + # ---------------------------------------------------------------------- # Fancy Indexing @@ -2056,7 +2167,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): result = dropped else: - labels = com._index_labels_to_array(labels) + labels = _ensure_object(com._index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') @@ -4099,6 +4210,26 @@ def isnull(self): def notnull(self): return notnull(self).__finalize__(self) + def _clip_with_scalar(self, lower, upper): + + if ((lower is not None and np.any(isnull(lower))) or + (upper is not None and np.any(isnull(upper)))): + raise ValueError("Cannot use an NA value as a clip threshold") + + result = self.values + mask = isnull(result) + + with np.errstate(all='ignore'): + if upper is not None: + result = np.where(result >= upper, upper, result) + if lower is not None: + result = np.where(result <= lower, lower, result) + if np.any(mask): + result[mask] = np.nan + + return self._constructor( + result, **self._construct_axes_dict()).__finalize__(self) + def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): """ Trim values at input threshold(s). 
@@ -4117,12 +4248,13 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): Examples -------- >>> df - 0 1 + 0 1 0 0.335232 -1.256177 1 -1.367855 0.746646 2 0.027753 -1.176076 3 0.230930 -0.679613 4 1.261967 0.570967 + >>> df.clip(-1.0, 0.5) 0 1 0 0.335232 -1.000000 @@ -4130,6 +4262,7 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 2 0.027753 -1.000000 3 0.230930 -0.679613 4 0.500000 0.500000 + >>> t 0 -0.3 1 -0.2 @@ -4137,6 +4270,7 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 3 0.0 4 0.1 dtype: float64 + >>> df.clip(t, t + 1, axis=0) 0 1 0 0.335232 -0.300000 @@ -4155,6 +4289,11 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): if is_scalar(lower) and is_scalar(upper): lower, upper = min(lower, upper), max(lower, upper) + # fast-path for scalars + if ((lower is None or (is_scalar(lower) and is_number(lower))) and + (upper is None or (is_scalar(upper) and is_number(upper)))): + return self._clip_with_scalar(lower, upper) + result = self if lower is not None: result = result.clip_lower(lower, axis) @@ -4184,6 +4323,9 @@ def clip_upper(self, threshold, axis=None): if np.any(isnull(threshold)): raise ValueError("Cannot use an NA value as a clip threshold") + if is_scalar(threshold) and is_number(threshold): + return self._clip_with_scalar(None, threshold) + subset = self.le(threshold, axis=axis) | isnull(self) return self.where(subset, threshold, axis=axis) @@ -4208,6 +4350,9 @@ def clip_lower(self, threshold, axis=None): if np.any(isnull(threshold)): raise ValueError("Cannot use an NA value as a clip threshold") + if is_scalar(threshold) and is_number(threshold): + return self._clip_with_scalar(threshold, None) + subset = self.ge(threshold, axis=axis) | isnull(self) return self.where(subset, threshold, axis=axis) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 82f3bf3b15462..2af4f112ca941 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2388,7 +2388,6 @@ def get_loc(self, key, method=None, tolerance=None): if tolerance is not None: raise ValueError('tolerance argument only valid if using pad, ' 'backfill or nearest lookups') - key = _values_from_object(key) try: return self._engine.get_loc(key) except KeyError: @@ -3960,7 +3959,7 @@ def _ensure_index(index_like, copy=False): if isinstance(index_like, list): if type(index_like) != list: index_like = list(index_like) - # 2200 ? 
+ converted, all_arrays = lib.clean_index_list(index_like) if len(converted) > 0 and all_arrays: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7ef037d8f3536..569e16f2141ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -75,7 +75,6 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() _comparables = ['names'] - _engine_type = libindex.MultiIndexEngine rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, @@ -629,7 +628,16 @@ def _get_level_number(self, level): @cache_readonly def _engine(self): - return self._engine_type(lambda: self, len(self)) + + # choose our engine based on our size + # the hashing based MultiIndex for larger + # sizes, and the MultiIndexObjectEngine for smaller + # xref: https://github.com/pandas-dev/pandas/pull/16324 + l = len(self) + if l > 10000: + return libindex.MultiIndexHashEngine(lambda: self, l) + + return libindex.MultiIndexObjectEngine(lambda: self.values, l) @property def values(self): @@ -740,7 +748,7 @@ def _hashed_indexing_key(self, key): we need to stringify if we have mixed levels """ - from pandas.core.util.hashing import hash_tuples + from pandas.core.util.hashing import hash_tuples, hash_tuple if not isinstance(key, tuple): return hash_tuples(key) @@ -754,7 +762,7 @@ def f(k, stringify): return k key = tuple([f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels)]) - return hash_tuples(key) + return hash_tuple(key) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3010348423340..51778684d68f5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -160,35 +160,41 @@ def _interp_limit(invalid, fw_limit, bw_limit): start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) - # This is a list of the indexes in the series whose yvalue is currently - # NaN, but whose interpolated yvalue will be overwritten with NaN after - # computing the interpolation. For each index in this list, one of these - # conditions is true of the corresponding NaN in the yvalues: + # violate_limit is a list of the indexes in the series whose yvalue is + # currently NaN, and should still be NaN after the interpolation. + # Specifically: # - # a) It is one of a chain of NaNs at the beginning of the series, and - # either limit is not specified or limit_direction is 'forward'. - # b) It is one of a chain of NaNs at the end of the series, and limit is - # specified and limit_direction is 'backward' or 'both'. - # c) Limit is nonzero and it is further than limit from the nearest non-NaN - # value (with respect to the limit_direction setting). + # If limit_direction='forward' or None then the list will contain NaNs at + # the beginning of the series, and NaNs that are more than 'limit' away + # from the prior non-NaN. 
# - # The default behavior is to fill forward with no limit, ignoring NaNs at - # the beginning (see issues #9218 and #10420) - violate_limit = sorted(start_nans) - - if limit is not None: - if not is_integer(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - if limit_direction == 'forward': - violate_limit = sorted(start_nans | set(_interp_limit(invalid, - limit, 0))) - if limit_direction == 'backward': - violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, - limit))) - if limit_direction == 'both': - violate_limit = sorted(_interp_limit(invalid, limit, limit)) + # If limit_direction='backward' then the list will contain NaNs at + # the end of the series, and NaNs that are more than 'limit' away + # from the subsequent non-NaN. + # + # If limit_direction='both' then the list will contain NaNs that + # are more than 'limit' away from any non-NaN. + # + # If limit=None, then use default behavior of filling an unlimited number + # of NaNs in the direction specified by limit_direction + + # default limit is unlimited GH #16282 + if limit is None: + limit = len(xvalues) + elif not is_integer(limit): + raise ValueError('Limit must be an integer') + elif limit < 1: + raise ValueError('Limit must be greater than 0') + + # each possible limit_direction + if limit_direction == 'forward': + violate_limit = sorted(start_nans | + set(_interp_limit(invalid, limit, 0))) + elif limit_direction == 'backward': + violate_limit = sorted(end_nans | + set(_interp_limit(invalid, 0, limit))) + elif limit_direction == 'both': + violate_limit = sorted(_interp_limit(invalid, limit, limit)) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index e7cfbdb0fc9c6..55473ec8d7cad 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1250,7 +1250,8 @@ def _flex_comp_method_FRAME(op, name, str_rep=None, default_axis='columns', masker=False): def na_op(x, y): try: - result = op(x, y) + with np.errstate(invalid='ignore'): + result = op(x, y) except TypeError: xrav = x.ravel() result = np.empty(x.size, dtype=bool) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 631b91c3aad11..2bb825541e23b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -184,6 +184,12 @@ def __getattr__(self, attr): matches_pattern = any(attr.startswith(x) for x in self._deprecated_valid_patterns) if not matches_pattern and attr not in self._deprecated_valids: + # avoid the warning, if it's just going to be an exception + # anyway. + if not hasattr(self.obj, attr): + raise AttributeError("'{}' has no attribute '{}'".format( + type(self.obj).__name__, attr + )) self = self._deprecated(attr) return object.__getattribute__(self, attr) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 779002b300cc7..f944dfe22361a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -689,7 +689,7 @@ def _convert_level_number(level_num, columns): new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? 
- new_levels.append(frame.columns.levels[level_num]) + new_levels.append(level_vals) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) @@ -1046,6 +1046,9 @@ def melt_stub(df, stub, i, j, value_vars, sep): else: i = list(i) + if df[i].duplicated().any(): + raise ValueError("the id variables need to uniquely identify each row") + value_vars = list(map(lambda stub: get_var_names(df, stub, sep, suffix), stubnames)) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 3c8f6e8c6257d..461dd50c5da6e 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -190,8 +190,8 @@ def _init_spmatrix(self, data, index, columns, dtype=None, values = Series(data.data, index=data.row, copy=False) for col, rowvals in values.groupby(data.col): # get_blocks expects int32 row indices in sorted order + rowvals = rowvals.sort_index() rows = rowvals.index.values.astype(np.int32) - rows.sort() blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries( diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 6a5343e8a8e25..e41ffae9d03c2 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -4,17 +4,17 @@ import itertools import numpy as np -from pandas._libs import hashing -from pandas._libs.lib import is_bool_array +from pandas._libs import hashing, tslib from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCIndexClass, ABCSeries, ABCDataFrame) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_list_like) + is_categorical_dtype, is_list_like) +from pandas.core.dtypes.missing import isnull +from pandas.core.dtypes.cast import infer_dtype_from_scalar + # 16 byte long hashing key _default_hash_key = '0123456789123456' @@ -136,7 +136,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): ------- ndarray of hashed values array """ - is_tuple = False if isinstance(vals, tuple): vals = [vals] @@ -168,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): return h +def hash_tuple(val, encoding='utf8', hash_key=None): + """ + Hash a single tuple efficiently + + Parameters + ---------- + val : single tuple + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + hash + + """ + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) + for v in val) + + h = _combine_hash_arrays(hashes, len(val))[0] + + return h + + def _hash_categorical(c, encoding, hash_key): """ Hash a Categorical by hashing its categories, and then mapping the codes @@ -231,6 +253,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") + dtype = vals.dtype if hash_key is None: hash_key = _default_hash_key @@ -238,22 +261,21 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. 
- if is_categorical_dtype(vals.dtype): + if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early - if np.issubdtype(vals.dtype, np.complex128): + elif np.issubdtype(dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - if is_bool_array(vals): + elif issubclass(dtype.type, np.bool_): vals = vals.astype('u8') - elif (is_datetime64_dtype(vals) or - is_timedelta64_dtype(vals)): + elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view('i8').astype('u8', copy=False) - elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): + elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, @@ -280,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals + + +def _hash_scalar(val, encoding='utf8', hash_key=None): + """ + Hash scalar value + + Returns + ------- + 1d uint64 numpy array of hash value, of length 1 + """ + + if isnull(val): + # this is to be consistent with the _hash_categorical implementation + return np.array([np.iinfo(np.uint64).max], dtype='u8') + + if getattr(val, 'tzinfo', None) is not None: + # for tz-aware datetimes, we need the underlying naive UTC value and + # not the tz aware object or pd extension type (as + # infer_dtype_from_scalar would do) + if not isinstance(val, tslib.Timestamp): + val = tslib.Timestamp(val) + val = val.tz_convert(None) + + dtype, val = infer_dtype_from_scalar(val) + vals = np.array([val], dtype=dtype) + + return hash_array(vals, hash_key=hash_key, encoding=encoding, + categorize=False) diff --git a/pandas/core/window.py b/pandas/core/window.py index df8e0c05009f4..cf1bad706ae1d 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1074,7 +1074,7 @@ def validate(self): super(Rolling, self).validate() # we allow rolling on a datetimelike index - if (self.is_datetimelike and + if ((self.obj.empty or self.is_datetimelike) and isinstance(self.window, (compat.string_types, DateOffset, timedelta))): diff --git a/pandas/io/api.py b/pandas/io/api.py index 7f0d3c3631f63..a4a25b78942db 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -5,7 +5,7 @@ # flake8: noqa from pandas.io.parsers import read_csv, read_table, read_fwf -from pandas.io.clipboard.clipboard import read_clipboard +from pandas.io.clipboards import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel from pandas.io.pytables import HDFStore, get_store, read_hdf from pandas.io.json import read_json diff --git a/pandas/io/clipboard/clipboard.py b/pandas/io/clipboards.py similarity index 100% rename from pandas/io/clipboard/clipboard.py rename to pandas/io/clipboards.py diff --git a/pandas/io/common.py b/pandas/io/common.py index 28f90972f95de..14ac4d366fcef 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -314,6 +314,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, handles = list() f = path_or_buf + + # Convert pathlib.Path/py.path.local or string + path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, compat.string_types) if compression: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 
65098bb2aa404..183d8d9d87d0b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -32,7 +32,8 @@ OrderedDict, unichr) from pandas.io.formats.terminal import get_terminal_size from pandas.core.config import get_option, set_option -from pandas.io.common import _get_handle, UnicodeWriter, _expand_user +from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user, + _stringify_path) from pandas.io.formats.printing import adjoin, justify, pprint_thing from pandas.io.formats.common import get_level_lengths import pandas.core.common as com @@ -1475,7 +1476,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if path_or_buf is None: path_or_buf = StringIO() - self.path_or_buf = _expand_user(path_or_buf) + self.path_or_buf = _expand_user(_stringify_path(path_or_buf)) self.sep = sep self.na_rep = na_rep self.float_format = float_format diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index eac82ddde2318..3d7e0fcdc69b3 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -238,24 +238,25 @@ def format_attr(pair): "class": " ".join(cs), "is_visible": True}) - for c, value in enumerate(clabels[r]): - cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] - cs.extend(cell_context.get( - "col_headings", {}).get(r, {}).get(c, [])) - es = { - "type": "th", - "value": value, - "display_value": value, - "class": " ".join(cs), - "is_visible": _is_visible(c, r, col_lengths), - } - colspan = col_lengths.get((r, c), 0) - if colspan > 1: - es["attributes"] = [ - format_attr({"key": "colspan", "value": colspan}) - ] - row_es.append(es) - head.append(row_es) + if clabels: + for c, value in enumerate(clabels[r]): + cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] + cs.extend(cell_context.get( + "col_headings", {}).get(r, {}).get(c, [])) + es = { + "type": "th", + "value": value, + "display_value": value, + "class": " ".join(cs), + "is_visible": _is_visible(c, r, col_lengths), + } + colspan = col_lengths.get((r, c), 0) + if colspan > 1: + es["attributes"] = [ + format_attr({"key": "colspan", "value": colspan}) + ] + row_es.append(es) + head.append(row_es) if self.data.index.names and not all(x is None for x in self.data.index.names): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 17bedd016f617..f017421c1f83a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -402,12 +402,17 @@ class HDFStore(StringMixin): and if the file does not exist it is created. ``'r+'`` It is similar to ``'a'``, but the file must already exist. - complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible + complevel : int, 0-9, default 0 + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. 
fletcher32 : bool, default False If applying compression use the fletcher32 checksum @@ -430,9 +435,10 @@ def __init__(self, path, mode=None, complevel=None, complib=None, raise ImportError('HDFStore requires PyTables, "{ex}" problem ' 'importing'.format(ex=str(ex))) - if complib not in (None, 'blosc', 'bzip2', 'lzo', 'zlib'): - raise ValueError("complib only supports 'blosc', 'bzip2', lzo' " - "or 'zlib' compression.") + if complib is not None and complib not in tables.filters.all_complibs: + raise ValueError( + "complib only supports {libs} compression.".format( + libs=tables.filters.all_complibs)) self._path = path if mode is None: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index e88979b14c8af..ec7c1f02f2ee8 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -48,9 +48,10 @@ def _get_standard_kind(kind): return {'density': 'kde'}.get(kind, kind) -def _gca(): +def _gca(rc=None): import matplotlib.pyplot as plt - return plt.gca() + with plt.rc_context(rc): + return plt.gca() def _gcf(): @@ -180,7 +181,8 @@ def _validate_color_args(self): colors = self.kwds.pop('colors') self.kwds['color'] = colors - if ('color' in self.kwds and self.nseries == 1): + if ('color' in self.kwds and self.nseries == 1 and + not is_list_like(self.kwds['color'])): # support series.plot(color='green') self.kwds['color'] = [self.kwds['color']] @@ -1868,12 +1870,6 @@ def plot_series(data, kind='line', ax=None, # Series unique **kwds): import matplotlib.pyplot as plt - """ - If no axes is specified, check whether there are existing figures - If there is no existing figures, _gca() will - create a figure with the default figsize, causing the figsize=parameter to - be ignored. - """ if ax is None and len(plt.get_fignums()) > 0: ax = _gca() ax = MPLPlot._get_ax_layer(ax) @@ -2003,7 +1999,8 @@ def plot_group(keys, values, ax): "'by' is None") if ax is None: - ax = _gca() + rc = {'figure.figsize': figsize} if figsize is not None else {} + ax = _gca(rc) data = data._get_numeric_data() if columns is None: columns = data.columns diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index e92724a5d9cd4..767e99d98cf29 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta, date import numpy as np -from pandas import Timedelta, Timestamp, DatetimeIndex +from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, @@ -213,6 +213,17 @@ def test_maybe_convert_scalar(self): result = maybe_convert_scalar(Timedelta('1 day 1 min')) assert result == Timedelta('1 day 1 min').value + def test_maybe_infer_to_datetimelike(self): + # GH16362 + # pandas=0.20.1 raises IndexError: tuple index out of range + result = DataFrame(np.array([[NaT, 'a', 'b', 0], + [NaT, 'b', 'c', 1]])) + assert result.size == 8 + # this construction was fine + result = DataFrame(np.array([[NaT, 'a', 0], + [NaT, 'b', 1]])) + assert result.size == 6 + class TestConvert(object): diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e6313dfc602a8..fbfbcc14e9150 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -641,6 +641,43 @@ def test_reset_index(self): xp = xp.set_index(['B'], append=True) assert_frame_equal(rs, xp, check_names=False) + def test_reset_index_level(self): + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=['A', 'B', 'C', 'D']) + + 
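+        # The loop below exercises both label-based ('A', 'B') and
+        # positional (0, 1) specs: reset_index(level=...) should move just
+        # the requested levels back into the columns, both for a MultiIndex
+        # and (GH 16263) for a single-level Index.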
for levels in ['A', 'B'], [0, 1]: + # With MultiIndex + result = df.set_index(['A', 'B']).reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = df.set_index(['A', 'B']).reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = df.set_index(['A', 'B']).reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(['A', 'B']).reset_index(level=levels, + drop=True) + tm.assert_frame_equal(result, df[['C', 'D']]) + + # With single-level Index (GH 16263) + result = df.set_index('A').reset_index(level=levels[0]) + tm.assert_frame_equal(result, df) + + result = df.set_index('A').reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df) + + result = df.set_index(['A']).reset_index(level=levels[0], + drop=True) + tm.assert_frame_equal(result, df[['B', 'C', 'D']]) + + # Missing levels - for both MultiIndex and single-level Index: + for idx_lev in ['A', 'B'], ['A']: + with tm.assert_raises_regex(KeyError, 'Level E '): + df.set_index(idx_lev).reset_index(level=['A', 'E']) + with tm.assert_raises_regex(IndexError, 'Too many levels'): + df.set_index(idx_lev).reset_index(level=[0, 1, 2]) + def test_reset_index_right_dtype(self): time = np.arange(0.0, 10, np.sqrt(2) / 2) s1 = Series((9.81 * time ** 2) / 2, diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index be89b27912d1c..818c1fc574551 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1201,6 +1201,14 @@ def test_isin_df(self): expected['B'] = False tm.assert_frame_equal(result, expected) + def test_isin_tuples(self): + # GH16394 + df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) + df['C'] = list(zip(df['A'], df['B'])) + result = df['C'].isin([(1, 'a')]) + tm.assert_series_equal(result, + Series([True, False, False], name="C")) + def test_isin_df_dupe_values(self): df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) # just cols duped @@ -1824,6 +1832,17 @@ def test_dataframe_clip(self): assert (clipped_df.values[ub_mask] == ub).all() assert (clipped_df.values[mask] == df.values[mask]).all() + @pytest.mark.xfail(reason=("clip on mixed integer or floats " + "with integer clippers coerces to float")) + def test_clip_mixed_numeric(self): + + df = DataFrame({'A': [1, 2, 3], + 'B': [1., np.nan, 3.]}) + result = df.clip(1, 2) + expected = DataFrame({'A': [1, 2, 2], + 'B': [1., np.nan, 2.]}) + tm.assert_frame_equal(result, expected, check_like=True) + def test_clip_against_series(self): # GH #6966 @@ -2045,3 +2064,16 @@ def test_n_duplicate_index(self, df_duplicates, n, order): result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False).head(n) tm.assert_frame_equal(result, expected) + + def test_series_broadcasting(self): + # smoke test for numpy warnings + # GH 16378, GH 16306 + df = DataFrame([1.0, 1.0, 1.0]) + df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]}) + s = Series([1, 1, 1]) + s_nan = Series([np.nan, np.nan, 1]) + + with tm.assert_produces_warning(None): + df_nan.clip_lower(s, axis=0) + for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']: + getattr(df, op)(s_nan, axis=0) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index a6326083c1bee..87d942101f5f1 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -61,6 +61,11 @@ def test_drop_names(self): expected = Index(['e', 'f'], name='second') 
tm.assert_index_equal(dropped.columns, expected) + # GH 16398 + dropped = df.drop([], errors='ignore') + expected = Index(['a', 'b', 'c'], name='first') + tm.assert_index_equal(dropped.index, expected) + def test_drop_col_still_multiindex(self): arrays = [['a', 'b', 'c', 'top'], ['', '', '', 'OD'], @@ -100,6 +105,7 @@ def test_drop(self): columns=['a', 'a', 'b']) assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) + assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) nu_df.columns = list('abc') diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 688cacdee263e..e82faaeef2986 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -763,3 +763,26 @@ def test_concat_datetime_datetime64_frame(self): # it works! pd.concat([df1, df2_obj]) + + +class TestDataFrameUpdate(TestData): + + def test_update_nan(self): + # #15593 #15617 + # test 1 + df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)}) + df2 = DataFrame({'A': [None, 2, 3]}) + expected = df1.copy() + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + # test 2 + df1 = DataFrame({'A': [1.0, None, 3], + 'B': date_range('2000', periods=3)}) + df2 = DataFrame({'A': [None, 2, 3]}) + expected = DataFrame({'A': [1.0, 2, 3], + 'B': date_range('2000', periods=3)}) + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 44f95266b6c78..c07856dc63602 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -29,32 +29,6 @@ def test_css_parse_normalisation(name, norm, abnorm): assert_same_resolution(norm, abnorm) -@pytest.mark.xfail(reason='CSS comments not yet stripped') -def test_css_parse_comments(): - assert_same_resolution('hello: world', - 'hello/* foo */:/* bar \n */ world /*;not:here*/') - - -@pytest.mark.xfail(reason='''we don't need to handle specificity - markers like !important, but we should - ignore them in the future''') -def test_css_parse_specificity(): - assert_same_resolution('font-weight: bold', 'font-weight: bold !important') - - -@pytest.mark.xfail(reason='Splitting CSS declarations not yet sensitive to ' - '; in CSS strings') -def test_css_parse_strings(): - # semicolons in strings - with tm.assert_produces_warning(CSSWarning): - assert_resolves( - 'background-image: url(\'http://blah.com/foo?a;b=c\')', - {'background-image': 'url(\'http://blah.com/foo?a;b=c\')'}) - assert_resolves( - 'background-image: url("http://blah.com/foo?a;b=c")', - {'background-image': 'url("http://blah.com/foo?a;b=c")'}) - - @pytest.mark.parametrize( 'invalid_css,remainder', [ # No colon @@ -62,15 +36,7 @@ def test_css_parse_strings(): ('border-style: solid; hello-world', 'border-style: solid'), ('border-style: solid; hello-world; font-weight: bold', 'border-style: solid; font-weight: bold'), - # Unclosed string - pytest.mark.xfail(('background-image: "abc', ''), - reason='Unclosed CSS strings not detected'), - pytest.mark.xfail(('font-family: "abc', ''), - reason='Unclosed CSS strings not detected'), - pytest.mark.xfail(('background-image: \'abc', ''), - reason='Unclosed CSS strings not detected'), - pytest.mark.xfail(('font-family: \'abc', ''), - reason='Unclosed CSS strings not detected'), + # Unclosed string fail # Invalid size ('font-size: 
blah', 'font-size: 1em'), ('font-size: 1a2b', 'font-size: 1em'), @@ -124,46 +90,6 @@ def test_css_side_shorthands(shorthand, expansions): {}) -@pytest.mark.xfail(reason='CSS font shorthand not yet handled') -@pytest.mark.parametrize('css,props', [ - ('font: italic bold 12pt helvetica,sans-serif', - {'font-family': 'helvetica,sans-serif', - 'font-style': 'italic', - 'font-weight': 'bold', - 'font-size': '12pt'}), - ('font: bold italic 12pt helvetica,sans-serif', - {'font-family': 'helvetica,sans-serif', - 'font-style': 'italic', - 'font-weight': 'bold', - 'font-size': '12pt'}), -]) -def test_css_font_shorthand(css, props): - assert_resolves(css, props) - - -@pytest.mark.xfail(reason='CSS background shorthand not yet handled') -@pytest.mark.parametrize('css,props', [ - ('background: blue', {'background-color': 'blue'}), - ('background: fixed blue', - {'background-color': 'blue', 'background-attachment': 'fixed'}), -]) -def test_css_background_shorthand(css, props): - assert_resolves(css, props) - - -@pytest.mark.xfail(reason='CSS border shorthand not yet handled') -@pytest.mark.parametrize('style,equiv', [ - ('border: 1px solid red', - 'border-width: 1px; border-style: solid; border-color: red'), - ('border: solid red 1px', - 'border-width: 1px; border-style: solid; border-color: red'), - ('border: red solid', - 'border-style: solid; border-color: red'), -]) -def test_css_border_shorthand(style, equiv): - assert_same_resolution(style, equiv) - - @pytest.mark.parametrize('style,inherited,equiv', [ ('margin: 1px; margin: 2px', '', 'margin: 2px'), diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index ee7356f12f498..9911888f758fb 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -103,6 +103,16 @@ def test_render(self): s.render() # it worked? + def test_render_empty_dfs(self): + empty_df = DataFrame() + es = Styler(empty_df) + es.render() + # An index but no columns + DataFrame(columns=['a']).style.render() + # A column but no index + DataFrame(index=['a']).style.render() + # No IndexError raised? 
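+        # (What these cases exercise is the new 'if clabels:' guard in
+        # style.py above: DataFrame().style.render() previously raised
+        # IndexError because there were no column labels to enumerate.)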
+ def test_render_double(self): df = pd.DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red; border: 1px", diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index fff5299921270..cdff3b8a5cca8 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -73,14 +73,7 @@ ('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}), ('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}), ('text-shadow: 0px -2em', {'font': {'shadow': True}}), - pytest.mark.xfail(('text-shadow: #CCC 3px 3px 3px', - {'font': {'shadow': True}}), - reason='text-shadow with color preceding width not yet ' - 'identified as shadow'), - pytest.mark.xfail(('text-shadow: #999 0px 0px 0px', - {'font': {'shadow': False}}), - reason='text-shadow with color preceding zero width not ' - 'yet identified as non-shadow'), + # FILL # - color, fillType ('background-color: red', {'fill': {'fgColor': 'FF0000', @@ -209,11 +202,3 @@ def test_css_to_excel_multiple(): def test_css_to_excel_inherited(css, inherited, expected): convert = CSSToExcelConverter(inherited) assert expected == convert(css) - - -@pytest.mark.xfail(reason='We are not currently warning for all unconverted ' - 'CSS, but possibly should') -def test_css_to_excel_warns_when_not_supported(): - convert = CSSToExcelConverter() - with pytest.warns(UserWarning): - convert('background: red') diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 2542deb0cedf1..4ee77abb32c26 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -from pandas import DataFrame, compat +from pandas import DataFrame, compat, Series from pandas.util import testing as tm from pandas.compat import u import codecs @@ -491,3 +491,18 @@ def test_to_latex_decimal(self, frame): """ assert withindex_result == withindex_expected + + def test_to_latex_series(self): + s = Series(['a', 'b', 'c']) + withindex_result = s.to_latex() + withindex_expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & a \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + assert withindex_result == withindex_expected diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 86b0e5a0c6a2d..59d908638a244 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -25,7 +25,7 @@ else partial(json.dumps, encoding="utf-8")) -class UltraJSONTests(object): +class TestUltraJSONTests(object): @pytest.mark.skipif(compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865") @@ -946,19 +946,19 @@ def my_obj_handler(obj): ujson.decode(ujson.encode(l, default_handler=str))) -class NumpyJSONTests(object): +class TestNumpyJSONTests(object): - def testBool(self): + def test_Bool(self): b = np.bool(True) assert ujson.decode(ujson.encode(b)) == b - def testBoolArray(self): + def test_BoolArray(self): inpt = np.array([True, False, True, True, False, True, False, False], dtype=np.bool) outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) tm.assert_numpy_array_equal(inpt, outp) - def testInt(self): + def test_Int(self): num = np.int(2562010) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -986,7 +986,7 @@ def testInt(self): num = np.uint64(2562010) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testIntArray(self): + def test_IntArray(self): arr = np.arange(100, 
dtype=np.int) dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, np.uint, np.uint8, np.uint16, np.uint32, np.uint64) @@ -995,7 +995,7 @@ def testIntArray(self): outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) tm.assert_numpy_array_equal(inpt, outp) - def testIntMax(self): + def test_IntMax(self): num = np.int(np.iinfo(np.int).max) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -1025,7 +1025,7 @@ def testIntMax(self): num = np.uint64(np.iinfo(np.int64).max) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testFloat(self): + def test_Float(self): num = np.float(256.2013) assert np.float(ujson.decode(ujson.encode(num))) == num @@ -1035,7 +1035,7 @@ def testFloat(self): num = np.float64(256.2013) assert np.float64(ujson.decode(ujson.encode(num))) == num - def testFloatArray(self): + def test_FloatArray(self): arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) dtypes = (np.float, np.float32, np.float64) @@ -1045,7 +1045,7 @@ def testFloatArray(self): inpt, double_precision=15)), dtype=dtype) tm.assert_almost_equal(inpt, outp) - def testFloatMax(self): + def test_FloatMax(self): num = np.float(np.finfo(np.float).max / 10) tm.assert_almost_equal(np.float(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) @@ -1058,7 +1058,7 @@ def testFloatMax(self): tm.assert_almost_equal(np.float64(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) - def testArrays(self): + def test_Arrays(self): arr = np.arange(100) arr = arr.reshape((10, 10)) @@ -1099,13 +1099,13 @@ def testArrays(self): outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) tm.assert_almost_equal(arr, outp) - def testOdArray(self): + def test_OdArray(self): def will_raise(): ujson.encode(np.array(1)) pytest.raises(TypeError, will_raise) - def testArrayNumpyExcept(self): + def test_ArrayNumpyExcept(self): input = ujson.dumps([42, {}, 'a']) try: @@ -1188,7 +1188,7 @@ def testArrayNumpyExcept(self): except: assert False, "Wrong exception" - def testArrayNumpyLabelled(self): + def test_ArrayNumpyLabelled(self): input = {'a': []} output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) assert (np.empty((1, 0)) == output[0]).all() @@ -1222,9 +1222,9 @@ def testArrayNumpyLabelled(self): assert (np.array(['a', 'b']) == output[2]).all() -class PandasJSONTests(object): +class TestPandasJSONTests(object): - def testDataFrame(self): + def test_DataFrame(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1254,7 +1254,7 @@ def testDataFrame(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNumpy(self): + def test_DataFrameNumpy(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1277,7 +1277,7 @@ def testDataFrameNumpy(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNested(self): + def test_DataFrameNested(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1303,7 +1303,7 @@ def testDataFrameNested(self): 'df2': ujson.decode(ujson.encode(df, orient="split"))} assert ujson.decode(ujson.encode(nested, orient="split")) == exp - def testDataFrameNumpyLabelled(self): + def test_DataFrameNumpyLabelled(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1326,7 +1326,7 @@ def testDataFrameNumpyLabelled(self): 
tm.assert_index_equal(df.columns, outp.columns) tm.assert_index_equal(df.index, outp.index) - def testSeries(self): + def test_Series(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1374,7 +1374,7 @@ def testSeries(self): s, orient="index"), numpy=True)).sort_values() tm.assert_series_equal(outp, exp) - def testSeriesNested(self): + def test_SeriesNested(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1400,7 +1400,7 @@ def testSeriesNested(self): 's2': ujson.decode(ujson.encode(s, orient="index"))} assert ujson.decode(ujson.encode(nested, orient="index")) == exp - def testIndex(self): + def test_Index(self): i = Index([23, 45, 18, 98, 43, 11], name="index") # column indexed diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 3e7a648474bc3..56ac10404b7b2 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -7,6 +7,8 @@ further arguments when parsing. """ +import sys + import pytest import numpy as np @@ -417,3 +419,30 @@ def test_data_after_quote(self): expected = DataFrame({'a': ['1', 'ba']}) tm.assert_frame_equal(result, expected) + + @tm.capture_stderr + def test_comment_whitespace_delimited(self): + test_input = """\ +1 2 +2 2 3 +3 2 3 # 3 fields +4 2 3# 3 fields +5 2 # 2 fields +6 2# 2 fields +7 # 1 field, NaN +8# 1 field, NaN +9 2 3 # skipped line +# comment""" + df = self.read_csv(StringIO(test_input), comment='#', header=None, + delimiter='\\s+', skiprows=0, + error_bad_lines=False) + error = sys.stderr.getvalue() + # skipped lines 2, 3, 4, 9 + for line_num in (2, 3, 4, 9): + assert 'Skipping line {}'.format(line_num) in error, error + expected = DataFrame([[1, 2], + [5, 2], + [6, 2], + [7, np.nan], + [8, np.nan]]) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index bcce0c6d020ae..31d815a4bca97 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -679,6 +679,19 @@ def test_file(self): tm.assert_frame_equal(url_table, local_table) + def test_path_pathlib(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_csv, + lambda p: self.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + def test_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath( + df.to_csv, + lambda p: self.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + def test_nonexistent_path(self): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index a5157744038f4..7070c3c7c9382 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -3,6 +3,7 @@ import pandas.util.testing as tm import os import io +import pytest import numpy as np @@ -65,6 +66,29 @@ def test_from_iterator(self): tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() + @pytest.mark.xfail(reason="read_sas currently doesn't work with pathlib") + def test_path_pathlib(self): + tm._skip_if_no_pathlib() + from pathlib import Path + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = Path(os.path.join(self.dirpath, "test%d.sas7bdat" % k)) + df = pd.read_sas(fname, encoding='utf-8') + tm.assert_frame_equal(df, df0) + + @pytest.mark.xfail(reason="read_sas currently doesn't work with localpath") + def 
test_path_localpath(self): + tm._skip_if_no_localpath() + from py.path import local as LocalPath + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = LocalPath(os.path.join(self.dirpath, + "test%d.sas7bdat" % k)) + df = pd.read_sas(fname, encoding='utf-8') + tm.assert_frame_equal(df, df0) + def test_iterator_loop(self): # github #13654 for j in 0, 1: diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index c70b5937fea3f..b4a5b24616728 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1858,6 +1858,16 @@ def test_freeze_panes(self): result = read_excel(path) tm.assert_frame_equal(expected, result) + def test_path_pathlib(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_excel, pd.read_excel) + tm.assert_frame_equal(df, result) + + def test_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_excel, pd.read_excel) + tm.assert_frame_equal(df, result) + def raise_wrapper(major_ver): def versioned_raise_wrapper(orig_method): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 232bb126d9d67..96df05aa096e4 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -9,6 +9,11 @@ from feather import FeatherError from pandas.util.testing import assert_frame_equal, ensure_clean +import pandas.util.testing as tm +from distutils.version import LooseVersion + + +fv = LooseVersion(feather.__version__) @pytest.mark.single @@ -56,6 +61,7 @@ def test_basic(self): assert df.dttz.dtype.tz.zone == 'US/Eastern' self.check_round_trip(df) + @pytest.mark.skipif(fv >= '0.4.0', reason='fixed in 0.4.0') def test_strided_data_issues(self): # strided data issuehttps://github.com/wesm/feather/issues/97 @@ -75,12 +81,10 @@ def test_stringify_columns(self): df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() self.check_error_on_write(df, ValueError) + @pytest.mark.skipif(fv >= '0.4.0', reason='fixed in 0.4.0') def test_unsupported(self): - # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) - self.check_error_on_write(df, ValueError) - + # timedelta df = pd.DataFrame({'a': pd.timedelta_range('1 day', periods=3)}) self.check_error_on_write(df, FeatherError) @@ -88,6 +92,12 @@ def test_unsupported(self): df = pd.DataFrame({'a': ['a', 1, 2.0]}) self.check_error_on_write(df, ValueError) + def test_unsupported_other(self): + + # period + df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + self.check_error_on_write(df, ValueError) + def test_write_with_index(self): df = pd.DataFrame({'A': [1, 2, 3]}) @@ -114,3 +124,15 @@ def test_write_with_index(self): df.index = [0, 1, 2] df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]), self.check_error_on_write(df, ValueError) + + @pytest.mark.xfail(reason="feather currently doesn't work with pathlib") + def test_path_pathlib(self): + df = tm.makeDataFrame().reset_index() + result = tm.round_trip_pathlib(df.to_feather, pd.read_feather) + tm.assert_frame_equal(df, result) + + @pytest.mark.xfail(reason="feather currently doesn't work with localpath") + def test_path_localpath(self): + df = tm.makeDataFrame().reset_index() + result = tm.round_trip_localpath(df.to_feather, pd.read_feather) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6da77bf423609..1e1d653cf94d1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -20,7 +20,7 
@@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import (map, zip, StringIO, string_types, BytesIO, - is_platform_windows) + is_platform_windows, PY3) from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas._libs.parsers import ParserError @@ -96,6 +96,9 @@ def read_html(self, *args, **kwargs): class TestReadHtml(ReadHtmlMixin): flavor = 'bs4' spam_data = os.path.join(DATA_PATH, 'spam.html') + spam_data_kwargs = {} + if PY3: + spam_data_kwargs['encoding'] = 'UTF-8' banklist_data = os.path.join(DATA_PATH, 'banklist.html') @classmethod @@ -247,10 +250,10 @@ def test_infer_types(self): assert_framelist_equal(df1, df2) def test_string_io(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data1 = StringIO(f.read()) - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) df1 = self.read_html(data1, '.*Water.*') @@ -258,7 +261,7 @@ def test_string_io(self): assert_framelist_equal(df1, df2) def test_string(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() df1 = self.read_html(data, '.*Water.*') @@ -267,10 +270,10 @@ def test_string(self): assert_framelist_equal(df1, df2) def test_file_like(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df1 = self.read_html(f, '.*Water.*') - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df2 = self.read_html(f, 'Unit') assert_framelist_equal(df1, df2) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 4b1145129c364..fd42becca3ac3 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -134,6 +134,18 @@ def test_string_io(self): result = read_msgpack(p) tm.assert_frame_equal(result, df) + @pytest.mark.xfail(reason="msgpack currently doesn't work with pathlib") + def test_path_pathlib(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack) + tm.assert_frame_equal(df, result) + + @pytest.mark.xfail(reason="msgpack currently doesn't work with localpath") + def test_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_msgpack, read_msgpack) + tm.assert_frame_equal(df, result) + def test_iterator_with_string_io(self): dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)] diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 875b5bd3055b9..429ec5ba1c474 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -299,6 +299,18 @@ def test_pickle_v0_15_2(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) +def test_pickle_path_pathlib(): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle) + tm.assert_frame_equal(df, result) + + +def test_pickle_path_localpath(): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle) + tm.assert_frame_equal(df, result) + + # --------------------- # test pickle compression # --------------------- diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index ee44fea55e51a..bb29425ff4942 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -734,6 +734,39 @@ def test_put_compression_blosc(self): store.put('c', df, 
format='table', complib='blosc')
             tm.assert_frame_equal(store['c'], df)
 
+    def test_complibs(self):
+        # GH14478
+        df = tm.makeDataFrame()
+
+        # Building list of all complibs and complevels tuples
+        all_complibs = tables.filters.all_complibs
+        # Remove lzo if it's not available on this platform
+        if not tables.which_lib_version('lzo'):
+            all_complibs.remove('lzo')
+        all_levels = range(0, 10)
+        all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels]
+
+        for (lib, lvl) in all_tests:
+            with ensure_clean_path(self.path) as tmpfile:
+                gname = 'foo'
+
+                # Write and read file to see if data is consistent
+                df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
+                result = pd.read_hdf(tmpfile, gname)
+                tm.assert_frame_equal(result, df)
+
+                # Open file and check metadata
+                # for correct amount of compression
+                h5table = tables.open_file(tmpfile, mode='r')
+                for node in h5table.walk_nodes(where='/' + gname,
+                                               classname='Leaf'):
+                    assert node.filters.complevel == lvl
+                    if lvl == 0:
+                        assert node.filters.complib is None
+                    else:
+                        assert node.filters.complib == lib
+                h5table.close()
+
     def test_put_integer(self):
         # non-date, non-string index
         df = DataFrame(np.random.randn(50, 100))
@@ -4249,6 +4282,49 @@ def test_select_filter_corner(self):
             result = store.select('frame', [crit])
             tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])
 
+    def test_path_pathlib(self):
+        df = tm.makeDataFrame()
+
+        result = tm.round_trip_pathlib(
+            lambda p: df.to_hdf(p, 'df'),
+            lambda p: pd.read_hdf(p, 'df'))
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.xfail(reason="pathlib currently doesn't work with HDFStore")
+    def test_path_pathlib_hdfstore(self):
+        df = tm.makeDataFrame()
+
+        def writer(path):
+            with pd.HDFStore(path) as store:
+                df.to_hdf(store, 'df')
+
+        def reader(path):
+            with pd.HDFStore(path) as store:
+                return pd.read_hdf(store, 'df')
+        result = tm.round_trip_pathlib(writer, reader)
+        tm.assert_frame_equal(df, result)
+
+    def test_path_localpath(self):
+        df = tm.makeDataFrame()
+        result = tm.round_trip_localpath(
+            lambda p: df.to_hdf(p, 'df'),
+            lambda p: pd.read_hdf(p, 'df'))
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.xfail(reason="localpath currently doesn't work with HDFStore")
+    def test_path_localpath_hdfstore(self):
+        df = tm.makeDataFrame()
+
+        def writer(path):
+            with pd.HDFStore(path) as store:
+                df.to_hdf(store, 'df')
+
+        def reader(path):
+            with pd.HDFStore(path) as store:
+                return pd.read_hdf(store, 'df')
+        result = tm.round_trip_localpath(writer, reader)
+        tm.assert_frame_equal(df, result)
+
     def _check_roundtrip(self, obj, comparator, compression=False,
                          **kwargs):
 
         options = {}
@@ -4939,8 +5015,8 @@ def test_invalid_complib(self):
                        index=list('abcd'),
                        columns=list('ABCDE'))
         with ensure_clean_path(self.path) as path:
-            pytest.raises(ValueError, df.to_hdf, path,
-                          'df', complib='blosc:zlib')
+            with pytest.raises(ValueError):
+                df.to_hdf(path, 'df', complib='foolib')
 
     # GH10443
     def test_read_nokey(self):
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 4c92c19c51e7a..4ec990116bb62 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1283,3 +1283,15 @@ def test_invalid_encoding(self):
         with pytest.raises(ValueError):
             with tm.ensure_clean() as path:
                 original.to_stata(path, encoding='utf-8')
+
+    @pytest.mark.xfail(reason="stata currently doesn't work with pathlib")
+    def test_path_pathlib(self):
+        df = tm.makeDataFrame()
+        result = tm.round_trip_pathlib(df.to_stata, read_stata)
+        tm.assert_frame_equal(df, result)
+
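+    # Once to_stata/read_stata accept path-like objects, the xfailed round
+    # trip above (and its localpath twin below) should amount to, roughly
+    # (hypothetical file name):
+    #
+    #     from pathlib import Path
+    #     df.to_stata(Path('out.dta'))
+    #     read_stata(Path('out.dta'))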
@pytest.mark.xfail(reason="stata currently doesn't work with localpath") + def test_pickle_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_stata, read_stata) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 1e06c13980657..547dd0154de4e 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -160,6 +160,14 @@ def test_boxplot_empty_column(self): df.loc[:, 0] = np.nan _check_plot_works(df.boxplot, return_type='axes') + @slow + def test_figsize(self): + df = DataFrame(np.random.rand(10, 5), + columns=['A', 'B', 'C', 'D', 'E']) + result = df.boxplot(return_type='axes', figsize=(12, 8)) + assert result.figure.bbox_inches.width == 12 + assert result.figure.bbox_inches.height == 8 + def test_fontsize(self): df = DataFrame({"a": [1, 2, 3, 4, 5, 6]}) self._check_ticks_props(df.boxplot("a", fontsize=16), diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4a4a71d7ea639..2de8c9acff98c 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -153,6 +153,11 @@ def test_mpl2_color_cycle_str(self): else: pytest.skip("not supported in matplotlib < 2.0.0") + def test_color_single_series_list(self): + # GH 3486 + df = DataFrame({"A": [1, 2, 3]}) + _check_plot_works(df.plot, color=['red']) + def test_color_empty_string(self): df = DataFrame(randn(10, 2)) with pytest.raises(ValueError): diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 79626d89026a7..d47a95924bd10 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -976,3 +976,14 @@ def test_multiple_id_columns(self): exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') tm.assert_frame_equal(long_frame, exp_frame) + + def test_non_unique_idvars(self): + # GH16382 + # Raise an error message if non unique id vars (i) are passed + df = pd.DataFrame({ + 'A_A1': [1, 2, 3, 4, 5], + 'B_B1': [1, 2, 3, 4, 5], + 'x': [1, 1, 1, 1, 1] + }) + with pytest.raises(ValueError): + wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 150767ee9e2b2..98ae749aaa10e 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -141,6 +141,45 @@ def test_reset_index(self): tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) assert isinstance(rs, Series) + def test_reset_index_level(self): + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], + columns=['A', 'B', 'C']) + + for levels in ['A', 'B'], [0, 1]: + # With MultiIndex + s = df.set_index(['A', 'B'])['C'] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = s.reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(['A', 'B']).reset_index(level=levels, + drop=True) + tm.assert_frame_equal(result, df[['C']]) + + with tm.assert_raises_regex(KeyError, 'Level E '): + s.reset_index(level=['A', 'E']) + + # With single-level Index + s = df.set_index('A')['B'] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df[['A', 'B']]) + + result = 
s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df[['A', 'B']]) + + result = s.reset_index(level=levels[0], drop=True) + tm.assert_series_equal(result, df['B']) + + with tm.assert_raises_regex(IndexError, 'Too many levels'): + s.reset_index(level=[0, 1, 2]) + def test_reset_index_range(self): # GH 12071 s = pd.Series(range(2), name='A', dtype='int64') diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ec6a118ec3639..18c6c9a6dd021 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1011,6 +1011,7 @@ def test_clip_against_series(self): lower = Series([1.0, 2.0, 3.0]) upper = Series([1.5, 2.5, 3.5]) + assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index c52c41877d5c0..8e73c17684a16 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -931,6 +931,24 @@ def test_interp_limit_forward(self): limit_direction='FORWARD') assert_series_equal(result, expected) + def test_interp_unlimited(self): + # these test are for issue #16282 default Limit=None is unlimited + s = Series([np.nan, 1., 3., np.nan, np.nan, np.nan, 11., np.nan]) + expected = Series([1., 1., 3., 5., 7., 9., 11., 11.]) + result = s.interpolate(method='linear', + limit_direction='both') + assert_series_equal(result, expected) + + expected = Series([np.nan, 1., 3., 5., 7., 9., 11., 11.]) + result = s.interpolate(method='linear', + limit_direction='forward') + assert_series_equal(result, expected) + + expected = Series([1., 1., 3., 5., 7., 9., 11., np.nan]) + result = s.interpolate(method='linear', + limit_direction='backward') + assert_series_equal(result, expected) + def test_interp_limit_bad_direction(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 3af61b0a902d3..c22e2ca8e0dc8 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from pandas import (Index, Series, DataFrame, date_range) +from pandas import (Index, Series, DataFrame, date_range, option_context) from pandas.core.index import MultiIndex from pandas.compat import lrange, range, u @@ -180,3 +180,21 @@ def test_timeseries_repr_object_dtype(self): ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)] repr(ts2).splitlines()[-1] + + def test_latex_repr(self): + result = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & $\alpha$ \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + with option_context('display.latex.escape', False, + 'display.latex.repr', True): + s = Series([r'$\alpha$', 'b', 'c']) + assert result == s._repr_latex_() + + assert s._repr_latex_() is None diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 0312b76ec30a5..654d12b782f37 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1146,8 +1146,8 @@ def test_isnotnull(self): tm.assert_frame_equal(res.to_dense(), exp) -@pytest.mark.parametrize('index', [None, list('ab')]) # noqa: F811 -@pytest.mark.parametrize('columns', [None, list('cd')]) +@pytest.mark.parametrize('index', [None, list('abc')]) # noqa: F811 +@pytest.mark.parametrize('columns', [None, list('def')]) @pytest.mark.parametrize('fill_value', [None, 0, np.nan]) 
@pytest.mark.parametrize('dtype', [bool, int, float, np.uint16]) def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): @@ -1156,7 +1156,9 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # Make one ndarray and from it one sparse matrix, both to be used for # constructing frames and comparing results - arr = np.eye(2, dtype=dtype) + arr = np.eye(3, dtype=dtype) + # GH 16179 + arr[0, 1] = dtype(2) try: spm = spmatrix(arr) assert spm.dtype == arr.dtype @@ -1245,6 +1247,26 @@ def test_from_to_scipy_object(spmatrix, fill_value): assert sdf.to_coo().dtype == res_dtype +def test_from_scipy_correct_ordering(spmatrix): + # GH 16179 + tm.skip_if_no_package('scipy') + + arr = np.arange(1, 5).reshape(2, 2) + try: + spm = spmatrix(arr) + assert spm.dtype == arr.dtype + except (TypeError, AssertionError): + # If conversion to sparse fails for this spmatrix type and arr.dtype, + # then the combination is not currently supported in NumPy, so we + # can just skip testing it thoroughly + return + + sdf = pd.SparseDataFrame(spm) + expected = pd.SparseDataFrame(arr) + tm.assert_sp_frame_equal(sdf, expected) + tm.assert_frame_equal(sdf.to_dense(), expected.to_dense()) + + class TestSparseDataFrameArithmetic(object): def test_numeric_op_scalar(self): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 03adf17f50300..3471f0b13b84b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -737,6 +737,17 @@ def test_unicode_print(self): assert _rep(c) == expected + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; c = pd.Categorical([])" + ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('c.', 1)) + def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') @@ -3822,6 +3833,43 @@ def test_cat_equality(self): pytest.raises(TypeError, lambda: a > b) pytest.raises(TypeError, lambda: b > a) + @pytest.mark.parametrize('ctor', [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ]) + def test_unordered_different_order_equal(self, ctor): + # https://github.com/pandas-dev/pandas/issues/16014 + c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + assert (c1 == c2).all() + + c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) + c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) + c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + result = c1 == c2 + tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) + + def test_unordered_different_categories_raises(self): + c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False) + with tm.assert_raises_regex(TypeError, + "Categoricals can only be compared"): + c1 == c2 + + def test_compare_different_lengths(self): + c1 = Categorical([], 
categories=['a', 'b']) + c2 = Categorical([], categories=['a']) + msg = "Categories are different lengths" + with tm.assert_raises_regex(TypeError, msg): + c1 == c2 + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index d7dbaccb87ee8..77ef535e08964 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -16,6 +16,7 @@ def test_mut_exclusive(): com._mut_exclusive(a=1, b=2) assert com._mut_exclusive(a=1, b=None) == 1 assert com._mut_exclusive(major=None, major_axis=None) is None + assert com._mut_exclusive(a=None, b=2) == 2 def test_get_callable_name(): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 2baedb82aa2a7..27e3c29a70a9f 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -4,7 +4,28 @@ import pytest import numpy as np # noqa from pandas import DataFrame +from pandas.compat import PY36 from pandas.util import testing as tm +import importlib + + +def import_module(name): + # we *only* want to skip if the module is truly not available + # and NOT just an actual import error because of pandas changes + + if PY36: + try: + return importlib.import_module(name) + except ModuleNotFoundError: # noqa + pytest.skip("skipping as {} not available".format(name)) + + else: + try: + return importlib.import_module(name) + except ImportError as e: + if "No module named" in str(e) and name in str(e): + pytest.skip("skipping as {} not available".format(name)) + raise @pytest.fixture @@ -14,8 +35,8 @@ def df(): def test_dask(df): - toolz = pytest.importorskip('toolz') # noqa - dask = pytest.importorskip('dask') # noqa + toolz = import_module('toolz') # noqa + dask = import_module('dask') # noqa import dask.dataframe as dd @@ -26,14 +47,14 @@ def test_dask(df): def test_xarray(df): - xarray = pytest.importorskip('xarray') # noqa + xarray = import_module('xarray') # noqa assert df.to_xarray() is not None def test_statsmodels(): - statsmodels = pytest.importorskip('statsmodels') # noqa + statsmodels = import_module('statsmodels') # noqa import statsmodels.api as sm import statsmodels.formula.api as smf df = sm.datasets.get_rdataset("Guerry", "HistData").data @@ -42,7 +63,7 @@ def test_statsmodels(): def test_scikit_learn(df): - sklearn = pytest.importorskip('sklearn') # noqa + sklearn = import_module('sklearn') # noqa from sklearn import svm, datasets digits = datasets.load_digits() @@ -53,33 +74,32 @@ def test_scikit_learn(df): def test_seaborn(): - seaborn = pytest.importorskip('seaborn') + seaborn = import_module('seaborn') tips = seaborn.load_dataset("tips") seaborn.stripplot(x="day", y="total_bill", data=tips) def test_pandas_gbq(df): - pandas_gbq = pytest.importorskip('pandas-gbq') # noqa + pandas_gbq = import_module('pandas_gbq') # noqa -@tm.network def test_pandas_datareader(): - pandas_datareader = pytest.importorskip('pandas-datareader') # noqa - pandas_datareader.get_data_yahoo('AAPL') + pandas_datareader = import_module('pandas_datareader') # noqa + pandas_datareader.get_data_google('AAPL') def test_geopandas(): - geopandas = pytest.importorskip('geopandas') # noqa + geopandas = import_module('geopandas') # noqa fp = geopandas.datasets.get_path('naturalearth_lowres') assert geopandas.read_file(fp) is not None def test_pyarrow(df): - pyarrow = pytest.importorskip('pyarrow') # noqa + pyarrow = import_module('pyarrow') # noqa table = pyarrow.Table.from_pandas(df) result = 
table.to_pandas() tm.assert_frame_equal(result, df) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index ab28b8b43f359..5a0132453cec5 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1195,6 +1195,37 @@ def test_unstack_unobserved_keys(self): recons = result.stack() tm.assert_frame_equal(recons, df) + def test_stack_order_with_unsorted_levels(self): + # GH 16323 + + def manual_compare_stacked(df, df_stacked, lev0, lev1): + assert all(df.loc[row, col] == + df_stacked.loc[(row, col[lev0]), col[lev1]] + for row in df.index for col in df.columns) + + # deep check for 1-row case + for width in [2, 3]: + levels_poss = itertools.product( + itertools.permutations([0, 1, 2], width), + repeat=2) + + for levels in levels_poss: + columns = MultiIndex(levels=levels, + labels=[[0, 0, 1, 1], + [0, 1, 0, 1]]) + df = DataFrame(columns=columns, data=[range(4)]) + for stack_lev in range(2): + df_stacked = df.stack(stack_lev) + manual_compare_stacked(df, df_stacked, + stack_lev, 1 - stack_lev) + + # check multi-row case + mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]], + labels=[np.repeat(range(3), 3), np.tile(range(3), 3)]) + df = DataFrame(columns=mi, index=range(5), + data=np.arange(5 * len(mi)).reshape(5, -1)) + manual_compare_stacked(df, df.stack(0), 0, 1) + def test_groupby_corner(self): midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], labels=[[0], [0], [0]], diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 9734431c8b012..dadae026979d2 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3,6 +3,7 @@ from warnings import catch_warnings from datetime import datetime, timedelta from functools import partial +from textwrap import dedent import pytest import numpy as np @@ -282,8 +283,7 @@ def test_attribute_access(self): tm.assert_series_equal(r.A.sum(), r['A'].sum()) # getting - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - pytest.raises(AttributeError, lambda: r.F) + pytest.raises(AttributeError, lambda: r.F) # setting def f(): @@ -2820,6 +2820,19 @@ def test_back_compat_v180(self): expected = df.groupby('A').resample('4s').mean().ffill() assert_frame_equal(result, expected) + def test_tab_complete_ipython6_warning(self, ip): + from IPython.core.completer import provisionalcompleter + code = dedent("""\ + import pandas.util.testing as tm + s = tm.makeTimeSeries() + rs = s.resample("D") + """) + ip.run_code(code) + + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('rs.', 1)) + def test_deferred_with_groupby(self): # GH 12486 diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 634cd5fe2586b..6a640d62108b3 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -441,6 +441,20 @@ def test_closed(self): with pytest.raises(ValueError): df.rolling(window=3, closed='neither') + @pytest.mark.parametrize('roller', ['1s', 1]) + def tests_empty_df_rolling(self, roller): + # GH 15819 Verifies that datetime and integer rolling windows can be + # applied to empty DataFrames + expected = DataFrame() + result = DataFrame().rolling(roller).sum() + tm.assert_frame_equal(result, expected) + + # Verifies that datetime and integer rolling windows can be applied to + # empty DataFrames with datetime index + expected = DataFrame(index=pd.DatetimeIndex([])) + result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() + 
        tm.assert_frame_equal(result, expected)
+
 
 class TestExpanding(Base):
@@ -483,6 +497,24 @@ def test_numpy_compat(self):
             tm.assert_raises_regex(UnsupportedFunctionCall, msg,
                                    getattr(e, func), dtype=np.float64)
 
+    @pytest.mark.parametrize(
+        'expander',
+        [1, pytest.mark.xfail(
+            reason='GH 16425 expanding with offset not supported')('1s')])
+    def tests_empty_df_expanding(self, expander):
+        # GH 15819 Verifies that datetime and integer expanding windows can be
+        # applied to empty DataFrames
+        expected = DataFrame()
+        result = DataFrame().expanding(expander).sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Verifies that datetime and integer expanding windows can be applied
+        # to empty DataFrames with datetime index
+        expected = DataFrame(index=pd.DatetimeIndex([]))
+        result = DataFrame(
+            index=pd.DatetimeIndex([])).expanding(expander).sum()
+        tm.assert_frame_equal(result, expected)
+
 
 class TestEWM(Base):
diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py
index f82ad97d7b70f..664a97640387e 100644
--- a/pandas/tests/tools/test_numeric.py
+++ b/pandas/tests/tools/test_numeric.py
@@ -11,6 +11,21 @@ class TestToNumeric(object):
 
+    def test_empty(self):
+        # see gh-16302
+        s = pd.Series([], dtype=object)
+
+        res = to_numeric(s)
+        expected = pd.Series([], dtype=np.int64)
+
+        tm.assert_series_equal(res, expected)
+
+        # Original issue example
+        res = to_numeric(s, errors='coerce', downcast='integer')
+        expected = pd.Series([], dtype=np.int8)
+
+        tm.assert_series_equal(res, expected)
+
     def test_series(self):
         s = pd.Series(['1', '-3.14', '7'])
         res = to_numeric(s)
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index e1e6e43529a7d..289592939e3da 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -1,4 +1,5 @@
 import pytest
+import datetime
 from warnings import catch_warnings
 
 import numpy as np
@@ -6,7 +7,7 @@
 from pandas import DataFrame, Series, Index, MultiIndex
 
 from pandas.util import hash_array, hash_pandas_object
-from pandas.core.util.hashing import hash_tuples
+from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
 import pandas.util.testing as tm
 
@@ -79,6 +80,27 @@ def test_hash_tuples(self):
         result = hash_tuples(tups[0])
         assert result == expected[0]
 
+    def test_hash_tuple(self):
+        # test equivalence between hash_tuples and hash_tuple
+        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
+                    ('A', pd.Timestamp("2012-01-01"))]:
+            result = hash_tuple(tup)
+            expected = hash_tuples([tup])[0]
+            assert result == expected
+
+    def test_hash_scalar(self):
+        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
+                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
+                    datetime.datetime(2012, 1, 1),
+                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
+                    pd.Timedelta('1 days'), datetime.timedelta(1),
+                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
+                    np.nan, pd.NaT, None]:
+            result = _hash_scalar(val)
+            expected = hash_array(np.array([val], dtype=object),
+                                  categorize=True)
+            assert result[0] == expected[0]
+
     def test_hash_tuples_err(self):
 
         for val in [5, 'foo', pd.Timestamp('20130101')]:
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index f6b572cdf7179..04461f84683f8 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -117,6 +117,63 @@ def round_trip_pickle(obj, path=None):
     return pd.read_pickle(path)
 
+
+def round_trip_pathlib(writer, reader, path=None):
+    """
+    Write an object to file specified by a pathlib.Path and read it back
+
+    Parameters
+    ----------
+    writer : callable bound to pandas object
+        IO writing function (e.g. DataFrame.to_csv)
+    reader : callable
+        IO reading function (e.g. pd.read_csv)
+    path : str, default None
+        The path where the object is written and then read.
+
+    Returns
+    -------
+    round_trip_object : pandas object
+        The original object that was serialized and then re-read.
+    """
+
+    import pytest
+    Path = pytest.importorskip('pathlib').Path
+    if path is None:
+        path = '___pathlib___'
+    with ensure_clean(path) as path:
+        writer(Path(path))
+        obj = reader(Path(path))
+    return obj
+
+
+def round_trip_localpath(writer, reader, path=None):
+    """
+    Write an object to file specified by a py.path LocalPath and read it back
+
+    Parameters
+    ----------
+    writer : callable bound to pandas object
+        IO writing function (e.g. DataFrame.to_csv)
+    reader : callable
+        IO reading function (e.g. pd.read_csv)
+    path : str, default None
+        The path where the object is written and then read.
+
+    Returns
+    -------
+    round_trip_object : pandas object
+        The original object that was serialized and then re-read.
+    """
+    import pytest
+    LocalPath = pytest.importorskip('py.path').local
+    if path is None:
+        path = '___localpath___'
+    with ensure_clean(path) as path:
+        writer(LocalPath(path))
+        obj = reader(LocalPath(path))
+    return obj
+
+
 def assert_almost_equal(left, right, check_exact=False,
                         check_dtype='equiv', check_less_precise=False,
                         **kwargs):
diff --git a/scripts/build_dist.sh b/scripts/build_dist.sh
index d6a7d0ba67239..c3f849ce7a6eb 100755
--- a/scripts/build_dist.sh
+++ b/scripts/build_dist.sh
@@ -10,11 +10,7 @@ read -p "Ok to continue (y/n)? " answer
 case ${answer:0:1} in
     y|Y )
         echo "Building distribution"
-        rm -rf dist
-        git clean -xfd
-        python setup.py clean
-        python setup.py cython
-        python setup.py sdist --formats=gztar
+        ./build_dist_for_release.sh
         ;;
     * )
         echo "Not building distribution"
diff --git a/ci/install_release_build.sh b/scripts/build_dist_for_release.sh
similarity index 69%
rename from ci/install_release_build.sh
rename to scripts/build_dist_for_release.sh
index f8373176643fa..e77974ae08b0c 100644
--- a/ci/install_release_build.sh
+++ b/scripts/build_dist_for_release.sh
@@ -2,7 +2,7 @@
 
 # this requires cython to be installed
 
-# this builds the release cleanly
+# this builds the release cleanly & builds from the current checkout
 rm -rf dist
 git clean -xfd
 python setup.py clean
diff --git a/setup.py b/setup.py
index d101358fb63dd..9a04bb6994869 100755
--- a/setup.py
+++ b/setup.py
@@ -524,7 +524,7 @@ def pxd(name):
                'sources': ['pandas/_libs/src/parser/tokenizer.c',
                            'pandas/_libs/src/parser/io.c']},
     '_libs.sparse': {'pyxfile': '_libs/sparse',
-                     'depends': (['pandas/core/sparse/sparse.pyx'] +
+                     'depends': (['pandas/_libs/sparse.pyx'] +
                                  _pxi_dep['sparse'])},
     '_libs.testing': {'pyxfile': '_libs/testing',
                       'depends': ['pandas/_libs/testing.pyx']},
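For reference, the IO tests added throughout this change use the two helpers defined above along the lines of the pickle round trip; a self-contained sketch:

    import pandas as pd
    import pandas.util.testing as tm

    df = tm.makeDataFrame()
    # writer and reader are each handed a pathlib.Path; the helper skips
    # (via pytest.importorskip) when pathlib is not installed
    result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(df, result)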