diff --git a/.travis.yml b/.travis.yml index 6c4d6897a69de..0d143d7f7133b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,6 +22,7 @@ matrix: - LOCALE_OVERRIDE="it_IT.UTF-8" - BUILD_TYPE=conda - JOB_NAME: "26_nslow_nnet" + - INSTALL_TEST=true - python: 2.7 env: - NOSE_ARGS="slow and not network and not disabled" @@ -30,6 +31,24 @@ matrix: - JOB_TAG=_LOCALE - BUILD_TYPE=conda - JOB_NAME: "27_slow_nnet_LOCALE" + - python: 2.7 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - BUILD_TYPE=conda + - JOB_NAME: "27_build_test" + - JOB_TAG=_BUILD_TEST + - BUILD_TEST=true + - python: 2.7 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - BUILD_TYPE=pydata + - JOB_NAME: "27_build_test" + - JOB_TAG=_BUILD_TEST + - BUILD_TEST=true - python: 2.7 env: - NOSE_ARGS="not slow and not disabled" @@ -115,6 +134,24 @@ matrix: - NUMPY_BUILD=master - BUILD_TYPE=pydata - PANDAS_TESTING_MODE="deprecate" + - python: 2.7 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - BUILD_TYPE=conda + - JOB_NAME: "27_build_test" + - JOB_TAG=_BUILD_TEST + - BUILD_TEST=true + - python: 2.7 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - BUILD_TYPE=pydata + - JOB_NAME: "27_build_test" + - JOB_TAG=_BUILD_TEST + - BUILD_TEST=true before_install: - echo "before_install" @@ -147,6 +184,7 @@ script: # nothing here, or failed tests won't fail travis after_script: + - ci/install_test.sh - if [ -f /tmp/doc.log ]; then cat /tmp/doc.log; fi - source activate pandas && ci/print_versions.py - ci/print_skipped.py /tmp/nosetests.xml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f7041dbabdad5..284ac2fc5b169 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,8 +12,8 @@ navigate to the [GitHub "issues" tab](https://github.com/pydata/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pydata/pandas/issues?labels=Docs&sort=updated&state=open) -and [Good as first -PR](https://github.com/pydata/pandas/issues?labels=Good+as+first+PR&sort=updated&state=open) +and [Difficulty +Novice](https://github.com/pydata/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out. Or maybe through using *pandas* you have an idea of you own or are @@ -137,6 +137,69 @@ clear what the branch brings to *pandas*. You can have many shiny-new-features and switch in between them using the git checkout command. +### Creating a Development Environment + +An easy way to create a *pandas* development environment is as follows. + +- Install either Install Anaconda \ or + Install miniconda \ +- Make sure that you have + cloned the repository \ +- `cd` to the pandas source directory + +Tell `conda` to create a new environment, named `pandas_dev`, or any +name you would like for this environment by running: + + conda create -n pandas_dev --file ci/requirements_dev.txt + +For a python 3 environment + + conda create -n pandas_dev python=3 --file ci/requirements_dev.txt + +If you are on `windows`, then you will need to install the compiler +linkages: + + conda install -n pandas_dev libpython + +This will create the new environment, and not touch any of your existing +environments, nor any existing python installation. It will install all +of the basic dependencies of *pandas*, as well as the development and +testing tools. 
If you would like to install other dependencies, you can +install them as follows: + + conda install -n pandas_dev -c pandas pytables scipy + +To install *all* pandas dependencies you can do the following: + + conda install -n pandas_dev -c pandas --file ci/requirements_all.txt + +To work in this environment, `activate` it as follows: + + activate pandas_dev + +At which point, the prompt will change to indicate you are in the new +development environment. + +> **note** +> +> The above syntax is for `windows` environments. To work on +> `macosx/linux`, use: +> +> source activate pandas_dev + +To view your environments: + + conda info -e + +To return to you home root environment: + + deactivate + +See the full `conda` docs [here](http://conda.pydata.org/docs). + +At this point you can easily do an *in-place* install, as detailed in +the next section. + ### Making changes Before making your code changes, it is often necessary to build the code @@ -231,13 +294,19 @@ docstrings that follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of `numpydoc` is included in the *pandas* source code. +It is easiest to +create a development environment \, then +install: + + conda install -n pandas_dev sphinx ipython + Furthermore, it is recommended to have all [optional dependencies](http://pandas.pydata.org/pandas-docs/dev/install.html#optional-dependencies) -installed. This is not needed, but be aware that you will see some error -messages. Because all the code in the documentation is executed during -the doc build, the examples using this optional dependencies will -generate errors. Run `pd.show_versions()` to get an overview of the -installed version of all dependencies. +installed. This is not strictly necessary, but be aware that you will +see some error messages. Because all the code in the documentation is +executed during the doc build, the examples using this optional +dependencies will generate errors. Run `pd.show_versions()` to get an +overview of the installed version of all dependencies. > **warning** > diff --git a/README.md b/README.md index cea7e8c6bfd72..8623ee170d154 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # pandas: powerful Python data analysis toolkit -![Travis-CI Build Status](https://travis-ci.org/pydata/pandas.svg) +[![Build Status](https://travis-ci.org/pydata/pandas.svg?branch=master)](https://travis-ci.org/pydata/pandas) ## What is it @@ -123,7 +123,7 @@ conda install pandas - xlrd >= 0.9.0 - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) - Alternative Excel writer. -- [Google bq Command Line Tool](https://developers.google.com/bigquery/bq-command-line-tool/) +- [Google bq Command Line Tool](https://cloud.google.com/bigquery/bq-command-line-tool) - Needed for `pandas.io.gbq` - [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. 
- One of the following combinations of libraries is needed to use the diff --git a/ci/install_conda.sh b/ci/install_conda.sh index 4c8a62c64979d..01b89807d164c 100755 --- a/ci/install_conda.sh +++ b/ci/install_conda.sh @@ -86,6 +86,9 @@ conda remove -n pandas pandas source activate pandas +pip install -U blosc # See https://github.com/pydata/pandas/pull/9783 +python -c 'import blosc; blosc.print_versions()' + # set the compiler cache to work if [ "$IRON_TOKEN" ]; then export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH @@ -96,7 +99,13 @@ if [ "$IRON_TOKEN" ]; then export CC='ccache gcc' fi -python setup.py build_ext --inplace && python setup.py develop +if [ "$BUILD_TEST" ]; then + pip uninstall --yes cython + pip install cython==0.15.1 + ( python setup.py build_ext --inplace && python setup.py develop ) || true +else + python setup.py build_ext --inplace && python setup.py develop +fi for package in beautifulsoup4; do pip uninstall --yes $package diff --git a/ci/install_pydata.sh b/ci/install_pydata.sh index 33a6d3854da22..f2ab5af34dc64 100755 --- a/ci/install_pydata.sh +++ b/ci/install_pydata.sh @@ -137,8 +137,15 @@ if [ "$IRON_TOKEN" ]; then fi # build pandas -python setup.py build_ext --inplace -python setup.py develop +if [ "$BUILD_TEST" ]; then + pip uninstall --yes cython + pip install cython==0.15.1 + ( python setup.py build_ext --inplace ) || true + ( python setup.py develop ) || true +else + python setup.py build_ext --inplace + python setup.py develop +fi # restore cython (if not numpy building) if [ -z "$NUMPY_BUILD" ]; then diff --git a/ci/install_test.sh b/ci/install_test.sh new file mode 100755 index 0000000000000..e01ad7b94a349 --- /dev/null +++ b/ci/install_test.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +echo "inside $0" + +if [ "$INSTALL_TEST" ]; then + source activate pandas + echo "Starting installation test." + conda uninstall cython || exit 1 + python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1 + pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1 + nosetests --exe -A "$NOSE_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml +else + echo "Skipping installation test." +fi +RET="$?" 
+ +exit "$RET" diff --git a/ci/requirements-2.7_BUILD_TEST.txt b/ci/requirements-2.7_BUILD_TEST.txt new file mode 100644 index 0000000000000..b273ca043c4a2 --- /dev/null +++ b/ci/requirements-2.7_BUILD_TEST.txt @@ -0,0 +1,5 @@ +dateutil +pytz +numpy +cython +nose diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt new file mode 100644 index 0000000000000..c70efed96a8dd --- /dev/null +++ b/ci/requirements_all.txt @@ -0,0 +1,21 @@ +nose +sphinx +ipython +dateutil +pytz +openpyxl +xlsxwriter +xlrd +html5lib +patsy +beautiful-soup +numpy +cython +scipy +numexpr +pytables +matplotlib +lxml +sqlalchemy +bottleneck +pymysql diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt new file mode 100644 index 0000000000000..b273ca043c4a2 --- /dev/null +++ b/ci/requirements_dev.txt @@ -0,0 +1,5 @@ +dateutil +pytz +numpy +cython +nose diff --git a/ci/script.sh b/ci/script.sh index b1ba7ba79c816..fe9db792df5e7 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -16,9 +16,12 @@ fi "$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 > /tmp/doc.log & # doc build log will be shown after tests - -echo nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml -nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml +if [ "$BUILD_TEST" ]; then + echo "We are not running nosetests as this is simply a build test." +else + echo nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml + nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml +fi RET="$?" diff --git a/doc/_templates/api_redirect.html b/doc/_templates/api_redirect.html new file mode 100644 index 0000000000000..24bdd8363830f --- /dev/null +++ b/doc/_templates/api_redirect.html @@ -0,0 +1,15 @@ +{% set pgn = pagename.split('.') -%} +{% if pgn[-2][0].isupper() -%} + {% set redirect = ["pandas", pgn[-2], pgn[-1], 'html']|join('.') -%} +{% else -%} + {% set redirect = ["pandas", pgn[-1], 'html']|join('.') -%} +{% endif -%} + + + + This API page has moved + + +
+    This API page has moved <a href="{{ redirect }}">here</a>.
+ + \ No newline at end of file diff --git a/doc/_templates/autosummary/accessor.rst b/doc/_templates/autosummary/accessor.rst new file mode 100644 index 0000000000000..1401121fb51c6 --- /dev/null +++ b/doc/_templates/autosummary/accessor.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module.split('.')[0] }} + +.. automethod:: {{ [module.split('.')[1], objname]|join('.') }} diff --git a/doc/_templates/autosummary/class_without_autosummary.rst b/doc/_templates/autosummary/class_without_autosummary.rst new file mode 100644 index 0000000000000..6676c672b206d --- /dev/null +++ b/doc/_templates/autosummary/class_without_autosummary.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 1f59c38d75f93..1714e00030026 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -6,18 +6,16 @@ :suppress: import numpy as np - import random + import pandas as pd import os np.random.seed(123456) - from pandas import options - import pandas as pd np.set_printoptions(precision=4, suppress=True) import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' - options.display.max_rows=15 + pd.options.display.mpl_style = 'default' + pd.options.display.max_rows = 15 #### portions of this were borrowed from the #### Pandas cheatsheet @@ -45,21 +43,22 @@ Object Creation See the :ref:`Data Structure Intro section ` -Creating a ``Series`` by passing a list of values, letting pandas create a default -integer index +Creating a :class:`Series` by passing a list of values, letting pandas create +a default integer index: .. ipython:: python s = pd.Series([1,3,5,np.nan,6,8]) s -Creating a ``DataFrame`` by passing a numpy array, with a datetime index and labeled columns. +Creating a :class:`DataFrame` by passing a numpy array, with a datetime index +and labeled columns: .. ipython:: python - dates = pd.date_range('20130101',periods=6) + dates = pd.date_range('20130101', periods=6) dates - df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')) df Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. @@ -128,7 +127,7 @@ See the top & bottom rows of the frame df.head() df.tail(3) -Display the index,columns, and the underlying numpy data +Display the index, columns, and the underlying numpy data .. ipython:: python @@ -297,7 +296,7 @@ Using the :func:`~Series.isin` method for filtering: .. ipython:: python df2 = df.copy() - df2['E']=['one', 'one','two','three','four','three'] + df2['E'] = ['one', 'one','two','three','four','three'] df2 df2[df2['E'].isin(['two','four'])] @@ -309,7 +308,7 @@ by the indexes .. ipython:: python - s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130102',periods=6)) + s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6)) s1 df['F'] = s1 @@ -358,7 +357,7 @@ returns a copy of the data. .. ipython:: python - df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) + df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) df1.loc[dates[0]:dates[1],'E'] = 1 df1 @@ -408,9 +407,9 @@ In addition, pandas automatically broadcasts along the specified dimension. .. 
ipython:: python - s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) + s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) s - df.sub(s,axis='index') + df.sub(s, axis='index') Apply @@ -430,7 +429,7 @@ See more at :ref:`Histogramming and Discretization ` .. ipython:: python - s = pd.Series(np.random.randint(0,7,size=10)) + s = pd.Series(np.random.randint(0, 7, size=10)) s s.value_counts() @@ -462,7 +461,7 @@ operations. See the :ref:`Merging section ` -Concatenating pandas objects together +Concatenating pandas objects together with :func:`concat`: .. ipython:: python @@ -515,9 +514,9 @@ See the :ref:`Grouping section ` .. ipython:: python df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], + 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], + 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) df @@ -555,7 +554,8 @@ Stack df2 = df[:4] df2 -The ``stack`` function "compresses" a level in the DataFrame's columns. +The :meth:`~DataFrame.stack` method "compresses" a level in the DataFrame's +columns. .. ipython:: python @@ -563,8 +563,8 @@ The ``stack`` function "compresses" a level in the DataFrame's columns. stacked With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the -``index``), the inverse operation of ``stack`` is ``unstack``, which by default -unstacks the **last level**: +``index``), the inverse operation of :meth:`~DataFrame.stack` is +:meth:`~DataFrame.unstack`, which by default unstacks the **last level**: .. ipython:: python @@ -708,7 +708,8 @@ Plotting @savefig series_plot_basic.png ts.plot() -On DataFrame, ``plot`` is a convenience to plot all of the columns with labels: +On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the +columns with labels: .. ipython:: python diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 1749409c863df..850f59c2713eb 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -6,15 +6,10 @@ :suppress: import numpy as np - import random - np.random.seed(123456) - from pandas import * - options.display.max_rows=15 import pandas as pd - randn = np.random.randn - randint = np.random.randint + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - from pandas.compat import range, zip + pd.options.display.max_rows=15 ****************************** MultiIndex / Advanced Indexing @@ -80,10 +75,10 @@ demo different ways to initialize MultiIndexes. tuples = list(zip(*arrays)) tuples - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) index - s = Series(randn(8), index=index) + s = pd.Series(np.random.randn(8), index=index) s When you want every pairing of the elements in two iterables, it can be easier @@ -92,7 +87,7 @@ to use the ``MultiIndex.from_product`` function: .. 
ipython:: python iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - MultiIndex.from_product(iterables, names=['first', 'second']) + pd.MultiIndex.from_product(iterables, names=['first', 'second']) As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically: @@ -101,9 +96,9 @@ DataFrame to construct a MultiIndex automatically: arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] - s = Series(randn(8), index=arrays) + s = pd.Series(np.random.randn(8), index=arrays) s - df = DataFrame(randn(8, 4), index=arrays) + df = pd.DataFrame(np.random.randn(8, 4), index=arrays) df All of the ``MultiIndex`` constructors accept a ``names`` argument which stores @@ -119,9 +114,9 @@ of the index is up to you: .. ipython:: python - df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) + df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index) df - DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) + pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) We've "sparsified" the higher levels of the indexes to make the console output a bit easier on the eyes. @@ -131,7 +126,7 @@ tuples as atomic labels on an axis: .. ipython:: python - Series(randn(8), index=tuples) + pd.Series(np.random.randn(8), index=tuples) The reason that the ``MultiIndex`` matters is that it can allow you to do grouping, selection, and reshaping operations as we will describe below and in @@ -282,16 +277,16 @@ As usual, **both sides** of the slicers are included as this is label indexing. def mklbl(prefix,n): return ["%s%s" % (prefix,i) for i in range(n)] - miindex = MultiIndex.from_product([mklbl('A',4), - mklbl('B',2), - mklbl('C',4), - mklbl('D',2)]) - micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), - ('b','foo'),('b','bah')], - names=['lvl0', 'lvl1']) - dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), - index=miindex, - columns=micolumns).sortlevel().sortlevel(axis=1) + miindex = pd.MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = pd.DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) dfmi Basic multi-index slicing using slices, lists, and labels. @@ -418,9 +413,9 @@ instance: .. ipython:: python - midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], - labels=[[1,1,0,0],[1,0,1,0]]) - df = DataFrame(randn(4,2), index=midx) + midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']], + labels=[[1,1,0,0],[1,0,1,0]]) + df = pd.DataFrame(np.random.randn(4,2), index=midx) df df2 = df.mean(level=0) df2 @@ -471,7 +466,7 @@ labels will be sorted lexicographically! .. ipython:: python import random; random.shuffle(tuples) - s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) + s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s s.sortlevel(0) s.sortlevel(1) @@ -509,13 +504,13 @@ an exception. Here is a concrete example to illustrate this: .. 
ipython:: python tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] - idx = MultiIndex.from_tuples(tuples) + idx = pd.MultiIndex.from_tuples(tuples) idx.lexsort_depth reordered = idx[[1, 0, 3, 2]] reordered.lexsort_depth - s = Series(randn(4), index=reordered) + s = pd.Series(np.random.randn(4), index=reordered) s.ix['a':'a'] However: @@ -540,7 +535,7 @@ index positions. ``take`` will also accept negative integers as relative positio .. ipython:: python - index = Index(randint(0, 1000, 10)) + index = pd.Index(np.random.randint(0, 1000, 10)) index positions = [0, 9, 3] @@ -548,7 +543,7 @@ index positions. ``take`` will also accept negative integers as relative positio index[positions] index.take(positions) - ser = Series(randn(10)) + ser = pd.Series(np.random.randn(10)) ser.iloc[positions] ser.take(positions) @@ -558,7 +553,7 @@ row or column positions. .. ipython:: python - frm = DataFrame(randn(5, 3)) + frm = pd.DataFrame(np.random.randn(5, 3)) frm.take([1, 4, 3]) @@ -569,11 +564,11 @@ intended to work on boolean indices and may return unexpected results. .. ipython:: python - arr = randn(10) + arr = np.random.randn(10) arr.take([False, False, True, True]) arr[[0, 1]] - ser = Series(randn(10)) + ser = pd.Series(np.random.randn(10)) ser.take([False, False, True, True]) ser.ix[[0, 1]] @@ -583,17 +578,102 @@ faster than fancy indexing. .. ipython:: - arr = randn(10000, 5) + arr = np.random.randn(10000, 5) indexer = np.arange(10000) random.shuffle(indexer) timeit arr[indexer] timeit arr.take(indexer, axis=0) - ser = Series(arr[:, 0]) + ser = pd.Series(arr[:, 0]) timeit ser.ix[indexer] timeit ser.take(indexer) +.. _indexing.categoricalindex: + +CategoricalIndex +---------------- + +.. versionadded:: 0.16.1 + +We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting +indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0) +and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1, +setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to regular object-based ``Index``. + +.. ipython:: python + + df = pd.DataFrame({'A': np.arange(6), + 'B': list('aabbca')}) + df['B'] = df['B'].astype('category', categories=list('cab')) + df + df.dtypes + df.B.cat.categories + +Setting the index, will create create a ``CategoricalIndex`` + +.. ipython:: python + + df2 = df.set_index('B') + df2.index + +Indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an ``Index`` with duplicates. +The indexers MUST be in the category or the operation will raise. + +.. ipython:: python + + df2.loc['a'] + +These PRESERVE the ``CategoricalIndex`` + +.. ipython:: python + + df2.loc['a'].index + +Sorting will order by the order of the categories + +.. ipython:: python + + df2.sort_index() + +Groupby operations on the index will preserve the index nature as well + +.. ipython:: python + + df2.groupby(level=0).sum() + df2.groupby(level=0).sum().index + +Reindexing operations, will return a resulting index based on the type of the passed +indexer, meaning that passing a list will return a plain-old-``Index``; indexing with +a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories +of the PASSED ``Categorical`` dtype. This allows one to arbitrarly index these even with +values NOT in the categories, similarly to how you can reindex ANY pandas index. + +.. 
ipython :: python + + df2.reindex(['a','e']) + df2.reindex(['a','e']).index + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index + +.. warning:: + + Reshaping and Comparision operations on a ``CategoricalIndex`` must have the same categories + or a ``TypeError`` will be raised. + + .. code-block:: python + + In [9]: df3 = pd.DataFrame({'A' : np.arange(6), + 'B' : pd.Series(list('aabbca')).astype('category')}) + + In [11]: df3 = df3.set_index('B') + + In [11]: df3.index + Out[11]: CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], categories=[u'a', u'b', u'c'], ordered=False, name=u'B', dtype='category') + + In [12]: pd.concat([df2, df3] + TypeError: categories must match existing categories when appending + .. _indexing.float64index: Float64Index @@ -616,9 +696,9 @@ same. .. ipython:: python - indexf = Index([1.5, 2, 3, 4.5, 5]) + indexf = pd.Index([1.5, 2, 3, 4.5, 5]) indexf - sf = Series(range(5),index=indexf) + sf = pd.Series(range(5), index=indexf) sf Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) @@ -660,17 +740,17 @@ In non-float indexes, slicing using floats will raise a ``TypeError`` .. code-block:: python - In [1]: Series(range(5))[3.5] + In [1]: pd.Series(range(5))[3.5] TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) - In [1]: Series(range(5))[3.5:4.5] + In [1]: pd.Series(range(5))[3.5:4.5] TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) Using a scalar float indexer will be deprecated in a future version, but is allowed for now. .. code-block:: python - In [3]: Series(range(5))[3.0] + In [3]: pd.Series(range(5))[3.0] Out[3]: 3 Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat @@ -679,12 +759,12 @@ example be millisecond offsets. .. ipython:: python - dfir = concat([DataFrame(randn(5,2), - index=np.arange(5) * 250.0, - columns=list('AB')), - DataFrame(randn(6,2), - index=np.arange(4,10) * 250.1, - columns=list('AB'))]) + dfir = pd.concat([pd.DataFrame(np.random.randn(5,2), + index=np.arange(5) * 250.0, + columns=list('AB')), + pd.DataFrame(np.random.randn(6,2), + index=np.arange(4,10) * 250.1, + columns=list('AB'))]) dfir Selection operations then will always work on a value basis, for all selection operators. @@ -706,4 +786,3 @@ Of course if you need integer based selection, then use ``iloc`` .. ipython:: python dfir.iloc[0:5] - diff --git a/doc/source/api.rst b/doc/source/api.rst index af9f8c84388bd..f5ba03afc9f19 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -158,6 +158,7 @@ Top-level dealing with datetimelike bdate_range period_range timedelta_range + infer_freq Top-level evaluation ~~~~~~~~~~~~~~~~~~~~ @@ -357,6 +358,8 @@ Computations / Descriptive Stats Series.median Series.min Series.mode + Series.nlargest + Series.nsmallest Series.pct_change Series.prod Series.quantile @@ -390,6 +393,7 @@ Reindexing / Selection / Label manipulation Series.reindex_like Series.rename Series.reset_index + Series.sample Series.select Series.take Series.tail @@ -468,6 +472,7 @@ These can be accessed like ``Series.dt.``. Series.dt.microsecond Series.dt.nanosecond Series.dt.second + Series.dt.week Series.dt.weekofyear Series.dt.dayofweek Series.dt.weekday @@ -479,6 +484,10 @@ These can be accessed like ``Series.dt.``. 
Series.dt.is_quarter_end Series.dt.is_year_start Series.dt.is_year_end + Series.dt.daysinmonth + Series.dt.days_in_month + Series.dt.tz + Series.dt.freq **Datetime Methods** @@ -490,6 +499,7 @@ These can be accessed like ``Series.dt.``. Series.dt.to_pydatetime Series.dt.tz_localize Series.dt.tz_convert + Series.dt.normalize **Timedelta Properties** @@ -533,17 +543,22 @@ strings and apply several methods to it. These can be acccessed like Series.str.find Series.str.findall Series.str.get + Series.str.index Series.str.join Series.str.len Series.str.ljust Series.str.lower Series.str.lstrip Series.str.match + Series.str.normalize Series.str.pad + Series.str.partition Series.str.repeat Series.str.replace Series.str.rfind + Series.str.rindex Series.str.rjust + Series.str.rpartition Series.str.rstrip Series.str.slice Series.str.slice_replace @@ -552,7 +567,9 @@ strings and apply several methods to it. These can be acccessed like Series.str.strip Series.str.swapcase Series.str.title + Series.str.translate Series.str.upper + Series.str.wrap Series.str.zfill Series.str.isalnum Series.str.isalpha @@ -565,6 +582,20 @@ strings and apply several methods to it. These can be acccessed like Series.str.isdecimal Series.str.get_dummies +.. + The following is needed to ensure the generated pages are created with the + correct template (otherwise they would be created in the Series class page) + +.. + .. autosummary:: + :toctree: generated/ + :template: autosummary/accessor.rst + + Series.str + Series.cat + Series.dt + + .. _api.categorical: Categorical @@ -572,22 +603,28 @@ Categorical If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the -following usable methods and properties (all available as ``Series.cat.``). +following usable methods and properties: .. autosummary:: :toctree: generated/ + :template: autosummary/accessor_attribute.rst + + Series.cat.categories + Series.cat.ordered + Series.cat.codes + +.. autosummary:: + :toctree: generated/ + :template: autosummary/accessor_method.rst - Categorical.categories - Categorical.ordered - Categorical.rename_categories - Categorical.reorder_categories - Categorical.add_categories - Categorical.remove_categories - Categorical.remove_unused_categories - Categorical.set_categories - Categorical.as_ordered - Categorical.as_unordered - Categorical.codes + Series.cat.rename_categories + Series.cat.reorder_categories + Series.cat.add_categories + Series.cat.remove_categories + Series.cat.remove_unused_categories + Series.cat.set_categories + Series.cat.as_ordered + Series.cat.as_unordered To create a Series of dtype ``category``, use ``cat = s.astype("category")``. @@ -596,8 +633,13 @@ adding ordering information or special categories is need at creation time of th .. autosummary:: :toctree: generated/ + :template: autosummary/class_without_autosummary.rst Categorical + +.. autosummary:: + :toctree: generated/ + Categorical.from_codes ``np.asarray(categorical)`` works by implementing the array interface. 
Be aware, that this converts @@ -823,6 +865,7 @@ Reindexing / Selection / Label manipulation DataFrame.reindex_like DataFrame.rename DataFrame.reset_index + DataFrame.sample DataFrame.select DataFrame.set_index DataFrame.tail @@ -1071,6 +1114,7 @@ Reindexing / Selection / Label manipulation Panel.reindex_axis Panel.reindex_like Panel.rename + Panel.sample Panel.select Panel.take Panel.truncate @@ -1220,8 +1264,6 @@ Modifying and Computations Index.argmax Index.copy Index.delete - Index.diff - Index.sym_diff Index.drop Index.drop_duplicates Index.duplicated @@ -1267,15 +1309,17 @@ Time-specific operations Index.shift -Combining / joining / merging -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Combining / joining / set operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ Index.append - Index.intersection Index.join + Index.intersection Index.union + Index.difference + Index.sym_diff Selecting ~~~~~~~~~ @@ -1291,6 +1335,34 @@ Selecting Index.slice_indexer Index.slice_locs +.. _api.categoricalindex: + +CategoricalIndex +---------------- + +.. autosummary:: + :toctree: generated/ + + CategoricalIndex + +Categorical Components +~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + CategoricalIndex.codes + CategoricalIndex.categories + CategoricalIndex.ordered + CategoricalIndex.rename_categories + CategoricalIndex.reorder_categories + CategoricalIndex.add_categories + CategoricalIndex.remove_categories + CategoricalIndex.remove_unused_categories + CategoricalIndex.set_categories + CategoricalIndex.as_ordered + CategoricalIndex.as_unordered + .. _api.datetimeindex: DatetimeIndex @@ -1332,6 +1404,7 @@ Time/Date Components DatetimeIndex.is_quarter_end DatetimeIndex.is_year_start DatetimeIndex.is_year_end + DatetimeIndex.inferred_freq Selecting ~~~~~~~~~ @@ -1382,6 +1455,7 @@ Components TimedeltaIndex.microseconds TimedeltaIndex.nanoseconds TimedeltaIndex.components + TimedeltaIndex.inferred_freq Conversion ~~~~~~~~~~ @@ -1521,230 +1595,3 @@ Working with options get_option set_option option_context - - -.. - HACK - see github issue #4539. To ensure old links remain valid, include - here the autosummaries with previous currentmodules as a comment and add - them to a hidden toctree (to avoid warnings): - -.. 
toctree:: - :hidden: - - generated/pandas.core.common.isnull - generated/pandas.core.common.notnull - generated/pandas.core.reshape.get_dummies - generated/pandas.io.clipboard.read_clipboard - generated/pandas.io.excel.ExcelFile.parse - generated/pandas.io.excel.read_excel - generated/pandas.io.html.read_html - generated/pandas.io.json.read_json - generated/pandas.io.parsers.read_csv - generated/pandas.io.parsers.read_fwf - generated/pandas.io.parsers.read_table - generated/pandas.io.pickle.read_pickle - generated/pandas.io.pytables.HDFStore.append - generated/pandas.io.pytables.HDFStore.get - generated/pandas.io.pytables.HDFStore.put - generated/pandas.io.pytables.HDFStore.select - generated/pandas.io.pytables.read_hdf - generated/pandas.io.sql.read_sql - generated/pandas.io.sql.read_frame - generated/pandas.io.sql.write_frame - generated/pandas.io.stata.read_stata - generated/pandas.stats.moments.ewma - generated/pandas.stats.moments.ewmcorr - generated/pandas.stats.moments.ewmcov - generated/pandas.stats.moments.ewmstd - generated/pandas.stats.moments.ewmvar - generated/pandas.stats.moments.expanding_apply - generated/pandas.stats.moments.expanding_corr - generated/pandas.stats.moments.expanding_count - generated/pandas.stats.moments.expanding_cov - generated/pandas.stats.moments.expanding_kurt - generated/pandas.stats.moments.expanding_mean - generated/pandas.stats.moments.expanding_median - generated/pandas.stats.moments.expanding_quantile - generated/pandas.stats.moments.expanding_skew - generated/pandas.stats.moments.expanding_std - generated/pandas.stats.moments.expanding_sum - generated/pandas.stats.moments.expanding_var - generated/pandas.stats.moments.rolling_apply - generated/pandas.stats.moments.rolling_corr - generated/pandas.stats.moments.rolling_count - generated/pandas.stats.moments.rolling_cov - generated/pandas.stats.moments.rolling_kurt - generated/pandas.stats.moments.rolling_mean - generated/pandas.stats.moments.rolling_median - generated/pandas.stats.moments.rolling_quantile - generated/pandas.stats.moments.rolling_skew - generated/pandas.stats.moments.rolling_std - generated/pandas.stats.moments.rolling_sum - generated/pandas.stats.moments.rolling_var - generated/pandas.tools.merge.concat - generated/pandas.tools.merge.merge - generated/pandas.tools.pivot.pivot_table - generated/pandas.tseries.tools.to_datetime - -.. - .. currentmodule:: pandas.io.pickle - - .. autosummary:: - :toctree: generated/ - - read_pickle - - .. currentmodule:: pandas.io.parsers - - .. autosummary:: - :toctree: generated/ - - read_table - read_csv - read_fwf - - .. currentmodule:: pandas.io.clipboard - - .. autosummary:: - :toctree: generated/ - - read_clipboard - - .. currentmodule:: pandas.io.excel - - .. autosummary:: - :toctree: generated/ - - read_excel - ExcelFile.parse - - .. currentmodule:: pandas.io.json - - .. autosummary:: - :toctree: generated/ - - read_json - - .. currentmodule:: pandas.io.html - - .. autosummary:: - :toctree: generated/ - - read_html - - .. currentmodule:: pandas.io.pytables - - .. autosummary:: - :toctree: generated/ - - read_hdf - HDFStore.put - HDFStore.append - HDFStore.get - HDFStore.select - - .. currentmodule:: pandas.io.sql - - .. autosummary:: - :toctree: generated/ - - read_sql - read_frame - write_frame - - .. currentmodule:: pandas.io.stata - - .. autosummary:: - :toctree: generated/ - - read_stata - StataReader.data - StataReader.data_label - StataReader.value_labels - StataReader.variable_labels - StataWriter.write_file - - .. 
currentmodule:: pandas.tools.pivot - - .. autosummary:: - :toctree: generated/ - - pivot_table - - .. currentmodule:: pandas.tools.merge - - .. autosummary:: - :toctree: generated/ - - merge - concat - - .. currentmodule:: pandas.core.reshape - - .. autosummary:: - :toctree: generated/ - - get_dummies - - .. currentmodule:: pandas.core.common - - .. autosummary:: - :toctree: generated/ - - isnull - notnull - - .. currentmodule:: pandas.tseries.tools - - .. autosummary:: - :toctree: generated/ - - to_datetime - - - .. currentmodule:: pandas.stats.moments - - .. autosummary:: - :toctree: generated/ - - rolling_count - rolling_sum - rolling_mean - rolling_median - rolling_var - rolling_std - rolling_corr - rolling_cov - rolling_skew - rolling_kurt - rolling_apply - rolling_quantile - - - .. currentmodule:: pandas.stats.moments - - .. autosummary:: - :toctree: generated/ - - expanding_count - expanding_sum - expanding_mean - expanding_median - expanding_var - expanding_std - expanding_corr - expanding_cov - expanding_skew - expanding_kurt - expanding_apply - expanding_quantile - - - .. autosummary:: - :toctree: generated/ - - ewma - ewmstd - ewmvar - ewmcorr - ewmcov diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 8e78ac597479b..d16feb3a6c448 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1,16 +1,14 @@ .. currentmodule:: pandas -.. _basics: .. ipython:: python :suppress: import numpy as np - from pandas import * - randn = np.random.randn + import pandas as pd np.set_printoptions(precision=4, suppress=True) - from pandas.compat import lrange - options.display.max_rows=15 + pd.options.display.max_rows = 15 +.. _basics: ============================== Essential Basic Functionality @@ -22,26 +20,26 @@ the previous section: .. ipython:: python - index = date_range('1/1/2000', periods=8) - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = DataFrame(randn(8, 3), index=index, - columns=['A', 'B', 'C']) - wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + index = pd.date_range('1/1/2000', periods=8) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = pd.DataFrame(np.random.randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) .. _basics.head_tail: Head and Tail ------------- -To view a small sample of a Series or DataFrame object, use the ``head`` and -``tail`` methods. The default number of elements to display is five, but you -may pass a custom number. +To view a small sample of a Series or DataFrame object, use the +:meth:`~DataFrame.head` and :meth:`~DataFrame.tail` methods. The default number +of elements to display is five, but you may pass a custom number. .. ipython:: python - long_series = Series(randn(1000)) + long_series = pd.Series(np.random.randn(1000)) long_series.head() long_series.tail(3) @@ -134,16 +132,18 @@ be handled simultaneously. Matching / broadcasting behavior ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -DataFrame has the methods **add, sub, mul, div** and related functions **radd, -rsub, ...** for carrying out binary operations. For broadcasting behavior, +DataFrame has the methods :meth:`~DataFrame.add`, :meth:`~DataFrame.sub`, +:meth:`~DataFrame.mul`, :meth:`~DataFrame.div` and related functions +:meth:`~DataFrame.radd`, :meth:`~DataFrame.rsub`, ... 
+for carrying out binary operations. For broadcasting behavior, Series input is of primary interest. Using these functions, you can use to either match on the *index* or *columns* via the **axis** keyword: .. ipython:: python - df = DataFrame({'one' : Series(randn(3), index=['a', 'b', 'c']), - 'two' : Series(randn(4), index=['a', 'b', 'c', 'd']), - 'three' : Series(randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']), + 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), + 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) df row = df.ix[1] column = df['two'] @@ -164,8 +164,8 @@ Furthermore you can align a level of a multi-indexed DataFrame with a Series. .. ipython:: python dfmi = df.copy() - dfmi.index = MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], - names=['first','second']) + dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], + names=['first','second']) dfmi.sub(column, axis=0, level='second') With Panel, describing the matching behavior is a bit more difficult, so @@ -234,7 +234,8 @@ see :ref:`here` Boolean Reductions ~~~~~~~~~~~~~~~~~~ -You can apply the reductions: ``empty``, ``any()``, ``all()``, and ``bool()`` to provide a +You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`, +:meth:`~DataFrame.all`, and :meth:`~DataFrame.bool` to provide a way to summarize a boolean result. .. ipython:: python @@ -248,21 +249,22 @@ You can reduce to a final boolean value. (df>0).any().any() -You can test if a pandas object is empty, via the ``empty`` property. +You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` property. .. ipython:: python df.empty - DataFrame(columns=list('ABC')).empty + pd.DataFrame(columns=list('ABC')).empty -To evaluate single-element pandas objects in a boolean context, use the method ``.bool()``: +To evaluate single-element pandas objects in a boolean context, use the method +:meth:`~DataFrame.bool`: .. ipython:: python - Series([True]).bool() - Series([False]).bool() - DataFrame([[True]]).bool() - DataFrame([[False]]).bool() + pd.Series([True]).bool() + pd.Series([False]).bool() + pd.DataFrame([[True]]).bool() + pd.DataFrame([[False]]).bool() .. warning:: @@ -311,8 +313,8 @@ That is because NaNs do not compare as equals: np.nan == np.nan So, as of v0.13.1, NDFrames (such as Series, DataFrames, and Panels) -have an ``equals`` method for testing equality, with NaNs in corresponding -locations treated as equal. +have an :meth:`~DataFrame.equals` method for testing equality, with NaNs in +corresponding locations treated as equal. .. ipython:: python @@ -323,8 +325,8 @@ equality to be True: .. ipython:: python - df1 = DataFrame({'col':['foo', 0, np.nan]}) - df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) + df1 = pd.DataFrame({'col':['foo', 0, np.nan]}) + df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) df1.equals(df2) df1.equals(df2.sort()) @@ -339,14 +341,15 @@ be of "higher quality". However, the lower quality series might extend further back in history or have more complete data coverage. As such, we would like to combine two DataFrame objects where missing values in one DataFrame are conditionally filled with like-labeled values from the other DataFrame. The -function implementing this operation is ``combine_first``, which we illustrate: +function implementing this operation is :meth:`~DataFrame.combine_first`, +which we illustrate: .. 
ipython:: python - df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], - 'B' : [np.nan, 2., 3., np.nan, 6.]}) - df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], - 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) df1 df2 df1.combine_first(df2) @@ -354,16 +357,16 @@ function implementing this operation is ``combine_first``, which we illustrate: General DataFrame Combine ~~~~~~~~~~~~~~~~~~~~~~~~~ -The ``combine_first`` method above calls the more general DataFrame method -``combine``. This method takes another DataFrame and a combiner function, -aligns the input DataFrame and then passes the combiner function pairs of -Series (i.e., columns whose names are the same). +The :meth:`~DataFrame.combine_first` method above calls the more general +DataFrame method :meth:`~DataFrame.combine`. This method takes another DataFrame +and a combiner function, aligns the input DataFrame and then passes the combiner +function pairs of Series (i.e., columns whose names are the same). -So, for instance, to reproduce ``combine_first`` as above: +So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: .. ipython:: python - combiner = lambda x, y: np.where(isnull(x), y, x) + combiner = lambda x, y: np.where(pd.isnull(x), y, x) df1.combine(df2, combiner) .. _basics.stats: @@ -374,8 +377,9 @@ Descriptive statistics A large number of methods for computing descriptive statistics and other related operations on :ref:`Series `, :ref:`DataFrame `, and :ref:`Panel `. Most of these -are aggregations (hence producing a lower-dimensional result) like **sum**, -**mean**, and **quantile**, but some of them, like **cumsum** and **cumprod**, +are aggregations (hence producing a lower-dimensional result) like +:meth:`~DataFrame.sum`, :meth:`~DataFrame.mean`, and :meth:`~DataFrame.quantile`, +but some of them, like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod`, produce an object of the same size. Generally speaking, these methods take an **axis** argument, just like *ndarray.{sum, std, ...}*, but the axis can be specified by name or integer: @@ -412,8 +416,8 @@ standard deviation 1), very concisely: xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0) xs_stand.std(1) -Note that methods like **cumsum** and **cumprod** preserve the location of NA -values: +Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` +preserve the location of NA values: .. ipython:: python @@ -456,12 +460,12 @@ will exclude NAs on Series input by default: np.mean(df['one']) np.mean(df['one'].values) -``Series`` also has a method ``nunique`` which will return the number of unique -non-null values: +``Series`` also has a method :meth:`~Series.nunique` which will return the +number of unique non-null values: .. ipython:: python - series = Series(randn(500)) + series = pd.Series(np.random.randn(500)) series[20:500] = np.nan series[10:20] = 5 series.nunique() @@ -471,16 +475,16 @@ non-null values: Summarizing data: describe ~~~~~~~~~~~~~~~~~~~~~~~~~~ -There is a convenient ``describe`` function which computes a variety of summary +There is a convenient :meth:`~DataFrame.describe` function which computes a variety of summary statistics about a Series or the columns of a DataFrame (excluding NAs of course): .. 
ipython:: python - series = Series(randn(1000)) + series = pd.Series(np.random.randn(1000)) series[::2] = np.nan series.describe() - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.ix[::2] = np.nan frame.describe() @@ -492,21 +496,21 @@ You can select specific percentiles to include in the output: By default, the median is always included. -For a non-numerical Series object, `describe` will give a simple summary of the -number of unique values and most frequently occurring values: - +For a non-numerical Series object, :meth:`~Series.describe` will give a simple +summary of the number of unique values and most frequently occurring values: .. ipython:: python - s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) s.describe() -Note that on a mixed-type DataFrame object, `describe` will restrict the summary to -include only numerical columns or, if none are, only categorical columns: +Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will +restrict the summary to include only numerical columns or, if none are, only +categorical columns: .. ipython:: python - frame = DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) frame.describe() This behaviour can be controlled by providing a list of types as ``include``/``exclude`` @@ -518,33 +522,36 @@ arguments. The special value ``all`` can also be used: frame.describe(include=['number']) frame.describe(include='all') -That feature relies on :ref:`select_dtypes `. Refer to there for details about accepted inputs. +That feature relies on :ref:`select_dtypes `. Refer to +there for details about accepted inputs. .. _basics.idxmin: Index of Min/Max Values ~~~~~~~~~~~~~~~~~~~~~~~ -The ``idxmin`` and ``idxmax`` functions on Series and DataFrame compute the -index labels with the minimum and maximum corresponding values: +The :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` functions on Series +and DataFrame compute the index labels with the minimum and maximum +corresponding values: .. ipython:: python - s1 = Series(randn(5)) + s1 = pd.Series(np.random.randn(5)) s1 s1.idxmin(), s1.idxmax() - df1 = DataFrame(randn(5,3), columns=['A','B','C']) + df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C']) df1 df1.idxmin(axis=0) df1.idxmax(axis=1) When there are multiple rows (or columns) matching the minimum or maximum -value, ``idxmin`` and ``idxmax`` return the first matching index: +value, :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` return the first +matching index: .. ipython:: python - df3 = DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) df3 df3['A'].idxmin() @@ -557,59 +564,59 @@ value, ``idxmin`` and ``idxmax`` return the first matching index: Value counts (histogramming) / Mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ``value_counts`` Series method and top-level function computes a histogram +The :meth:`~Series.value_counts` Series method and top-level function computes a histogram of a 1D array of values. It can also be used as a function on regular arrays: .. 
ipython:: python data = np.random.randint(0, 7, size=50) data - s = Series(data) + s = pd.Series(data) s.value_counts() - value_counts(data) + pd.value_counts(data) Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: .. ipython:: python - s5 = Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) + s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() - df5 = DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) + df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50)}) df5.mode() Discretization and quantiling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Continuous values can be discretized using the ``cut`` (bins based on values) -and ``qcut`` (bins based on sample quantiles) functions: +Continuous values can be discretized using the :func:`cut` (bins based on values) +and :func:`qcut` (bins based on sample quantiles) functions: .. ipython:: python arr = np.random.randn(20) - factor = cut(arr, 4) + factor = pd.cut(arr, 4) factor - factor = cut(arr, [-5, -1, 0, 1, 5]) + factor = pd.cut(arr, [-5, -1, 0, 1, 5]) factor -``qcut`` computes sample quantiles. For example, we could slice up some +:func:`qcut` computes sample quantiles. For example, we could slice up some normally distributed data into equal-size quartiles like so: .. ipython:: python arr = np.random.randn(30) - factor = qcut(arr, [0, .25, .5, .75, 1]) + factor = pd.qcut(arr, [0, .25, .5, .75, 1]) factor - value_counts(factor) + pd.value_counts(factor) We can also pass infinite values to define the bins: .. ipython:: python arr = np.random.randn(20) - factor = cut(arr, [-np.inf, 0, np.inf]) + factor = pd.cut(arr, [-np.inf, 0, np.inf]) factor .. _basics.apply: @@ -618,8 +625,8 @@ Function application -------------------- Arbitrary functions can be applied along the axes of a DataFrame or Panel -using the ``apply`` method, which, like the descriptive statistics methods, -take an optional ``axis`` argument: +using the :meth:`~DataFrame.apply` method, which, like the descriptive +statistics methods, take an optional ``axis`` argument: .. ipython:: python @@ -629,20 +636,20 @@ take an optional ``axis`` argument: df.apply(np.cumsum) df.apply(np.exp) -Depending on the return type of the function passed to ``apply``, the result -will either be of lower dimension or the same dimension. +Depending on the return type of the function passed to :meth:`~DataFrame.apply`, +the result will either be of lower dimension or the same dimension. -``apply`` combined with some cleverness can be used to answer many questions +:meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions about a data set. For example, suppose we wanted to extract the date where the maximum value for each column occurred: .. ipython:: python - tsdf = DataFrame(randn(1000, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=1000)) + tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], + index=pd.date_range('1/1/2000', periods=1000)) tsdf.apply(lambda x: x.idxmax()) -You may also pass additional arguments and keyword arguments to the ``apply`` +You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply` method. For instance, consider the following function you would like to apply: .. code-block:: python @@ -662,16 +669,16 @@ Series operation on each column or row: .. 
ipython:: python :suppress: - tsdf = DataFrame(randn(10, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], + index=pd.date_range('1/1/2000', periods=10)) tsdf.values[3:7] = np.nan .. ipython:: python tsdf - tsdf.apply(Series.interpolate) + tsdf.apply(pd.Series.interpolate) -Finally, ``apply`` takes an argument ``raw`` which is False by default, which +Finally, :meth:`~DataFrame.apply` takes an argument ``raw`` which is False by default, which converts each row or column into a Series before applying the function. When set to True, the passed function will instead receive an ndarray object, which has positive performance implications if you do not need the indexing @@ -687,9 +694,9 @@ Applying elementwise Python functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Since not all functions can be vectorized (accept NumPy arrays and return -another array or value), the methods ``applymap`` on DataFrame and analogously -``map`` on Series accept any Python function taking a single value and -returning a single value. For example: +another array or value), the methods :meth:`~DataFrame.applymap` on DataFrame +and analogously :meth:`~Series.map` on Series accept any Python function taking +a single value and returning a single value. For example: .. ipython:: python :suppress: @@ -703,16 +710,15 @@ returning a single value. For example: df4['one'].map(f) df4.applymap(f) -``Series.map`` has an additional feature which is that it can be used to easily +:meth:`Series.map` has an additional feature which is that it can be used to easily "link" or "map" values defined by a secondary series. This is closely related to :ref:`merging/joining functionality `: - .. ipython:: python - s = Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = Series({'six' : 6., 'seven' : 7.}) + s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], + index=['a', 'b', 'c', 'd', 'e']) + t = pd.Series({'six' : 6., 'seven' : 7.}) s s.map(t) @@ -789,7 +795,7 @@ This is equivalent to the following .. ipython:: python - result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) + result = pd.Panel(dict([ (ax, f(panel.loc[:,:,ax])) for ax in panel.minor_axis ])) result result.loc[:,:,'ItemA'] @@ -797,12 +803,11 @@ This is equivalent to the following .. _basics.reindexing: - Reindexing and altering labels ------------------------------ -``reindex`` is the fundamental data alignment method in pandas. It is used to -implement nearly all other features relying on label-alignment +:meth:`~Series.reindex` is the fundamental data alignment method in pandas. +It is used to implement nearly all other features relying on label-alignment functionality. To *reindex* means to conform the data to match a given set of labels along a particular axis. This accomplishes several things: @@ -816,7 +821,7 @@ Here is a simple example: .. ipython:: python - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) s s.reindex(['e', 'b', 'f', 'd']) @@ -830,8 +835,8 @@ With a DataFrame, you can simultaneously reindex the index and columns: df df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) -For convenience, you may utilize the ``reindex_axis`` method, which takes the -labels and a keyword ``axis`` parameter. +For convenience, you may utilize the :meth:`~Series.reindex_axis` method, which +takes the labels and a keyword ``axis`` parameter. 
Note that the ``Index`` objects containing the actual axis labels can be **shared** between objects. So if we have a Series and a DataFrame, the @@ -869,8 +874,8 @@ Reindexing to align with another object You may wish to take an object and reindex its axes to be labeled the same as another object. While the syntax for this is straightforward albeit verbose, it -is a common enough operation that the ``reindex_like`` method is available to -make this simpler: +is a common enough operation that the :meth:`~DataFrame.reindex_like` method is +available to make this simpler: .. ipython:: python :suppress: @@ -885,15 +890,12 @@ make this simpler: df3 df.reindex_like(df2) -Reindexing with ``reindex_axis`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. _basics.align: Aligning objects with each other with ``align`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ``align`` method is the fastest way to simultaneously align two objects. It +The :meth:`~Series.align` method is the fastest way to simultaneously align two objects. It supports a ``join`` argument (related to :ref:`joining and merging `): - ``join='outer'``: take the union of the indexes (default) @@ -905,7 +907,7 @@ It returns a tuple with both of the reindexed Series: .. ipython:: python - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) s1 = s[:4] s2 = s[1:] s1.align(s2) @@ -929,7 +931,7 @@ You can also pass an ``axis`` option to only align on the specified axis: .. _basics.align.frame.series: -If you pass a Series to ``DataFrame.align``, you can choose to align both +If you pass a Series to :meth:`DataFrame.align`, you can choose to align both objects either on the DataFrame's index or columns using the ``axis`` argument: .. ipython:: python @@ -941,8 +943,8 @@ objects either on the DataFrame's index or columns using the ``axis`` argument: Filling while reindexing ~~~~~~~~~~~~~~~~~~~~~~~~ -``reindex`` takes an optional parameter ``method`` which is a filling method -chosen from the following table: +:meth:`~Series.reindex` takes an optional parameter ``method`` which is a +filling method chosen from the following table: .. csv-table:: :header: "Method", "Action" @@ -956,8 +958,8 @@ We illustrate these fill methods on a simple Series: .. ipython:: python - rng = date_range('1/3/2000', periods=8) - ts = Series(randn(8), index=rng) + rng = pd.date_range('1/3/2000', periods=8) + ts = pd.Series(np.random.randn(8), index=rng) ts2 = ts[[0, 3, 6]] ts ts2 @@ -978,17 +980,17 @@ Note that the same result could have been achieved using ts2.reindex(ts.index).fillna(method='ffill') -``reindex`` will raise a ValueError if the index is not monotonic increasing or -descreasing. ``fillna`` and ``interpolate`` will not make any checks on the -order of the index. +:meth:`~Series.reindex` will raise a ValueError if the index is not monotonic +increasing or descreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate` +will not make any checks on the order of the index. .. _basics.drop: Dropping labels from an axis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A method closely related to ``reindex`` is the ``drop`` function. It removes a -set of labels from an axis: +A method closely related to ``reindex`` is the :meth:`~DataFrame.drop` function. +It removes a set of labels from an axis: .. ipython:: python @@ -1000,15 +1002,15 @@ Note that the following also works, but is a bit less obvious / clean: .. 
ipython:: python - df.reindex(df.index - ['a', 'd']) + df.reindex(df.index.difference(['a', 'd'])) .. _basics.rename: Renaming / mapping labels ~~~~~~~~~~~~~~~~~~~~~~~~~ -The ``rename`` method allows you to relabel an axis based on some mapping (a -dict or Series) or an arbitrary function. +The :meth:`~DataFrame.rename` method allows you to relabel an axis based on some +mapping (a dict or Series) or an arbitrary function. .. ipython:: python @@ -1024,14 +1026,14 @@ Series, it need only contain a subset of the labels as keys: df.rename(columns={'one' : 'foo', 'two' : 'bar'}, index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) -The ``rename`` method also provides an ``inplace`` named parameter that is by -default ``False`` and copies the underlying data. Pass ``inplace=True`` to -rename the data in place. +The :meth:`~DataFrame.rename` method also provides an ``inplace`` named +parameter that is by default ``False`` and copies the underlying data. Pass +``inplace=True`` to rename the data in place. .. _basics.rename_axis: -The Panel class has a related ``rename_axis`` class which can rename any of -its three axes. +The Panel class has a related :meth:`~Panel.rename_axis` class which can rename +any of its three axes. Iteration --------- @@ -1055,8 +1057,8 @@ Thus, for example: iteritems ~~~~~~~~~ -Consistent with the dict-like interface, **iteritems** iterates through -key-value pairs: +Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates +through key-value pairs: * **Series**: (index, scalar value) pairs * **DataFrame**: (column, Series) pairs @@ -1078,8 +1080,8 @@ iterrows ~~~~~~~~ New in v0.7 is the ability to iterate efficiently through rows of a -DataFrame. It returns an iterator yielding each index value along with a Series -containing the data in each row: +DataFrame with :meth:`~DataFrame.iterrows`. It returns an iterator yielding each +index value along with a Series containing the data in each row: .. ipython:: @@ -1091,11 +1093,11 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df2 = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) print(df2) print(df2.T) - df2_t = DataFrame(dict((idx,values) for idx, values in df2.iterrows())) + df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows())) print(df2_t) .. note:: @@ -1105,7 +1107,7 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df_iter = DataFrame([[1, 1.0]], columns=['x', 'y']) + df_iter = pd.DataFrame([[1, 1.0]], columns=['x', 'y']) row = next(df_iter.iterrows())[1] print(row['x'].dtype) print(df_iter['x'].dtype) @@ -1113,7 +1115,7 @@ For instance, a contrived way to transpose the DataFrame would be: itertuples ~~~~~~~~~~ -This method will return an iterator yielding a tuple for each row in the +The :meth:`~DataFrame.itertuples` method will return an iterator yielding a tuple for each row in the DataFrame. The first element of the tuple will be the row's corresponding index value, while the remaining values are the row values proper. @@ -1129,13 +1131,14 @@ For instance, .dt accessor ~~~~~~~~~~~~ -``Series`` has an accessor to succinctly return datetime like properties for the *values* of the Series, if its a datetime/period like Series. +``Series`` has an accessor to succinctly return datetime like properties for the +*values* of the Series, if its a datetime/period like Series. This will return a Series, indexed like the existing Series. .. 
ipython:: python # datetime - s = Series(date_range('20130101 09:10:12',periods=4)) + s = pd.Series(pd.date_range('20130101 09:10:12',periods=4)) s s.dt.hour s.dt.second @@ -1166,7 +1169,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # period - s = Series(period_range('20130101',periods=4,freq='D')) + s = pd.Series(pd.period_range('20130101', periods=4,freq='D')) s s.dt.year s.dt.day @@ -1174,7 +1177,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # timedelta - s = Series(timedelta_range('1 day 00:00:05',periods=4,freq='s')) + s = pd.Series(pd.timedelta_range('1 day 00:00:05',periods=4,freq='s')) s s.dt.days s.dt.seconds @@ -1195,7 +1198,7 @@ built-in string methods. For example: .. ipython:: python - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1213,7 +1216,7 @@ Sorting by index and value There are two obvious kinds of sorting that you may be interested in: sorting by label and sorting by actual values. The primary method for sorting axis -labels (indexes) across data structures is the ``sort_index`` method. +labels (indexes) across data structures is the :meth:`~DataFrame.sort_index` method. .. ipython:: python @@ -1223,13 +1226,13 @@ labels (indexes) across data structures is the ``sort_index`` method. unsorted_df.sort_index(ascending=False) unsorted_df.sort_index(axis=1) -``DataFrame.sort_index`` can accept an optional ``by`` argument for ``axis=0`` +:meth:`DataFrame.sort_index` can accept an optional ``by`` argument for ``axis=0`` which will use an arbitrary vector or a column name of the DataFrame to determine the sort order: .. ipython:: python - df1 = DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) + df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) df1.sort_index(by='two') The ``by`` argument can take a list of column names, e.g.: @@ -1238,7 +1241,7 @@ The ``by`` argument can take a list of column names, e.g.: df1[['one', 'two', 'three']].sort_index(by=['one','two']) -Series has the method ``order`` (analogous to `R's order function +Series has the method :meth:`~Series.order` (analogous to `R's order function `__) which sorts by value, with special treatment of NA values via the ``na_position`` argument: @@ -1251,21 +1254,21 @@ argument: .. note:: - ``Series.sort`` sorts a Series by value in-place. This is to provide + :meth:`Series.sort` sorts a Series by value in-place. This is to provide compatibility with NumPy methods which expect the ``ndarray.sort`` - behavior. ``Series.order`` returns a copy of the sorted data. + behavior. :meth:`Series.order` returns a copy of the sorted data. -Series has the ``searchsorted`` method, which works similar to -``np.ndarray.searchsorted``. +Series has the :meth:`~Series.searchsorted` method, which works similar to +:meth:`numpy.ndarray.searchsorted`. .. ipython:: python - ser = Series([1, 2, 3]) + ser = pd.Series([1, 2, 3]) ser.searchsorted([0, 3]) ser.searchsorted([0, 4]) ser.searchsorted([1, 3], side='right') ser.searchsorted([1, 3], side='left') - ser = Series([3, 1, 2]) + ser = pd.Series([3, 1, 2]) ser.searchsorted([0, 3], sorter=np.argsort(ser)) .. _basics.nsorted: @@ -1275,13 +1278,13 @@ smallest / largest values .. 
versionadded:: 0.14.0 -``Series`` has the ``nsmallest`` and ``nlargest`` methods which return the +``Series`` has the :meth:`~Series.nsmallest` and :meth:`~Series.nlargest` methods which return the smallest or largest :math:`n` values. For a large ``Series`` this can be much faster than sorting the entire Series and calling ``head(n)`` on the result. .. ipython:: python - s = Series(np.random.permutation(10)) + s = pd.Series(np.random.permutation(10)) s s.order() s.nsmallest(3) @@ -1298,14 +1301,14 @@ all levels to ``by``. .. ipython:: python - df1.columns = MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) + df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) df1.sort_index(by=('a','two')) Copying ------- -The ``copy`` method on pandas objects copies the underlying data (though not +The :meth:`~DataFrame.copy` method on pandas objects copies the underlying data (though not the axis indexes, since they are immutable) and returns a new object. Note that **it is seldom necessary to copy objects**. For example, there are only a handful of ways to alter a DataFrame *in-place*: @@ -1324,23 +1327,24 @@ untouched. If data is modified, it is because you did so explicitly. dtypes ------ -The main types stored in pandas objects are ``float``, ``int``, ``bool``, ``datetime64[ns]``, ``timedelta[ns]``, -and ``object``. In addition these dtypes have item sizes, e.g. ``int64`` and ``int32``. A convenient ``dtypes`` +The main types stored in pandas objects are ``float``, ``int``, ``bool``, +``datetime64[ns]``, ``timedelta[ns]`` and ``object``. In addition these dtypes +have item sizes, e.g. ``int64`` and ``int32``. A convenient :attr:`~DataFrame.dtypes`` attribute for DataFrames returns a Series with the data type of each column. .. ipython:: python - dft = DataFrame(dict( A = np.random.rand(3), - B = 1, - C = 'foo', - D = Timestamp('20010102'), - E = Series([1.0]*3).astype('float32'), - F = False, - G = Series([1]*3,dtype='int8'))) + dft = pd.DataFrame(dict(A = np.random.rand(3), + B = 1, + C = 'foo', + D = pd.Timestamp('20010102'), + E = pd.Series([1.0]*3).astype('float32'), + F = False, + G = pd.Series([1]*3,dtype='int8'))) dft dft.dtypes -On a ``Series`` use the ``dtype`` method. +On a ``Series`` use the :attr:`~Series.dtype` attribute. .. ipython:: python @@ -1353,12 +1357,12 @@ general). .. ipython:: python # these ints are coerced to floats - Series([1, 2, 3, 4, 5, 6.]) + pd.Series([1, 2, 3, 4, 5, 6.]) # string data forces an ``object`` dtype - Series([1, 2, 3, 6., 'foo']) + pd.Series([1, 2, 3, 6., 'foo']) -The method ``get_dtype_counts`` will return the number of columns of +The method :meth:`~DataFrame.get_dtype_counts` will return the number of columns of each type in a ``DataFrame``: .. ipython:: python @@ -1372,12 +1376,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. ipython:: python - df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') df1 df1.dtypes - df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), - B = Series(randn(8)), - C = Series(np.array(randn(8),dtype='uint8')) )) + df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'), + B = pd.Series(np.random.randn(8)), + C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) )) df2 df2.dtypes @@ -1389,16 +1393,16 @@ By default integer types are ``int64`` and float types are ``float64``, .. 
ipython:: python - DataFrame([1, 2], columns=['a']).dtypes - DataFrame({'a': [1, 2]}).dtypes - DataFrame({'a': 1 }, index=list(range(2))).dtypes + pd.DataFrame([1, 2], columns=['a']).dtypes + pd.DataFrame({'a': [1, 2]}).dtypes + pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes Numpy, however will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on 32-bit platform. .. ipython:: python - frame = DataFrame(np.array([1, 2])) + frame = pd.DataFrame(np.array([1, 2])) upcasting @@ -1426,7 +1430,7 @@ astype .. _basics.cast: -You can use the ``astype`` method to explicitly convert dtypes from one to another. These will by default return a copy, +You can use the :meth:`~DataFrame.astype` method to explicitly convert dtypes from one to another. These will by default return a copy, even if the dtype was unchanged (pass ``copy=False`` to change this behavior). In addition, they will raise an exception if the astype operation is invalid. @@ -1444,7 +1448,7 @@ then the more *general* one will be used as the result of the operation. object conversion ~~~~~~~~~~~~~~~~~ -``convert_objects`` is a method to try to force conversion of types from the ``object`` dtype to other types. +:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types. To force conversion of specific types that are *number like*, e.g. could be a string that represents a number, pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise they will be set to ``np.nan``. @@ -1467,13 +1471,14 @@ but occasionally has non-dates intermixed and you want to represent as missing. .. ipython:: python - s = Series([datetime(2001,1,1,0,0), - 'foo', 1.0, 1, Timestamp('20010104'), - '20010105'],dtype='O') + import datetime + s = pd.Series([datetime.datetime(2001,1,1,0,0), + 'foo', 1.0, 1, pd.Timestamp('20010104'), + '20010105'], dtype='O') s s.convert_objects(convert_dates='coerce') -In addition, ``convert_objects`` will attempt the *soft* conversion of any *object* dtypes, meaning that if all +In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all the objects in a Series are of the same type, the Series will have that dtype. gotchas @@ -1513,29 +1518,29 @@ Selecting columns based on ``dtype`` .. versionadded:: 0.14.1 -The :meth:`~pandas.DataFrame.select_dtypes` method implements subsetting of columns +The :meth:`~DataFrame.select_dtypes` method implements subsetting of columns based on their ``dtype``. -First, let's create a :class:`~pandas.DataFrame` with a slew of different +First, let's create a :class:`DataFrame` with a slew of different dtypes: .. 
ipython:: python - df = DataFrame({'string': list('abc'), - 'int64': list(range(1, 4)), - 'uint8': np.arange(3, 6).astype('u1'), - 'float64': np.arange(4.0, 7.0), - 'bool1': [True, False, True], - 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3).values, - 'category': pd.Categorical(list("ABC"))}) + df = pd.DataFrame({'string': list('abc'), + 'int64': list(range(1, 4)), + 'uint8': np.arange(3, 6).astype('u1'), + 'float64': np.arange(4.0, 7.0), + 'bool1': [True, False, True], + 'bool2': [False, True, False], + 'dates': pd.date_range('now', periods=3).values, + 'category': pd.Series(list("ABC")).astype('category')}) df['tdeltas'] = df.dates.diff() df['uint64'] = np.arange(3, 6).astype('u8') df['other_dates'] = pd.date_range('20130101', periods=3).values df -``select_dtypes`` has two parameters ``include`` and ``exclude`` that allow you to +:meth:`~DataFrame.select_dtypes` has two parameters ``include`` and ``exclude`` that allow you to say "give me the columns WITH these dtypes" (``include``) and/or "give the columns WITHOUT these dtypes" (``exclude``). diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index d03e0fb117c5c..0c63759201517 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -6,14 +6,10 @@ :suppress: import numpy as np - import random - import os - np.random.seed(123456) - from pandas import options - from pandas import * import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - options.display.max_rows=15 + pd.options.display.max_rows = 15 **************** @@ -23,11 +19,11 @@ Categorical Data .. versionadded:: 0.15 .. note:: - While there was in `pandas.Categorical` in earlier versions, the ability to use + While there was `pandas.Categorical` in earlier versions, the ability to use categorical data in `Series` and `DataFrame` is new. -This is a introduction to pandas categorical data type, including a short comparison +This is an introduction to pandas categorical data type, including a short comparison with R's ``factor``. `Categoricals` are a pandas data type, which correspond to categorical variables in @@ -65,14 +61,14 @@ By specifying ``dtype="category"`` when constructing a `Series`: .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s By converting an existing `Series` or column to a ``category`` dtype: .. ipython:: python - df = DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A":["a","b","c","a"]}) df["B"] = df["A"].astype('category') df @@ -80,7 +76,7 @@ By using some special functions: .. ipython:: python - df = DataFrame({'value': np.random.randint(0, 100, 20)}) + df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ] df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) @@ -92,11 +88,11 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to .. ipython:: python - raw_cat = Categorical(["a","b","c","a"], categories=["b","c","d"], + raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], ordered=False) - s = Series(raw_cat) + s = pd.Series(raw_cat) s - df = DataFrame({"A":["a","b","c","a"]}) + df = pd.DataFrame({"A":["a","b","c","a"]}) df["B"] = raw_cat df @@ -104,7 +100,7 @@ You can also specify differently ordered categories or make the resulting data o .. 
ipython:: python - s = Series(["a","b","c","a"]) + s = pd.Series(["a","b","c","a"]) s_cat = s.astype("category", categories=["b","c","d"], ordered=False) s_cat @@ -129,7 +125,7 @@ To get back to the original Series or `numpy` array, use ``Series.astype(origina .. ipython:: python - s = Series(["a","b","c","a"]) + s = pd.Series(["a","b","c","a"]) s s2 = s.astype('category') s2 @@ -143,7 +139,7 @@ constructor to save the factorize step during normal constructor mode: .. ipython:: python splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = Series(Categorical.from_codes(splitter, categories=["train", "test"])) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Description ----------- @@ -153,8 +149,8 @@ Using ``.describe()`` on categorical data will produce similar output to a `Seri .. ipython:: python - cat = Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) - df = DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) + df = pd.DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) df.describe() df["cat"].describe() @@ -168,7 +164,7 @@ passed in values. .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s.cat.categories s.cat.ordered @@ -176,7 +172,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = Series(Categorical(["a","b","c","a"], categories=["c","b","a"])) + s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"])) s.cat.categories s.cat.ordered @@ -194,7 +190,7 @@ by using the :func:`Categorical.rename_categories` method: .. ipython:: python - s = Series(["a","b","c","a"], dtype="category") + s = pd.Series(["a","b","c","a"], dtype="category") s s.cat.categories = ["Group %s" % g for g in s.cat.categories] s @@ -247,7 +243,7 @@ Removing unused categories can also be done: .. ipython:: python - s = Series(Categorical(["a","b","a"], categories=["a","b","c","d"])) + s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"])) s s.cat.remove_unused_categories() @@ -259,7 +255,7 @@ or simply set the categories to a predefined scale, use :func:`Categorical.set_c .. ipython:: python - s = Series(["one","two","four", "-"], dtype="category") + s = pd.Series(["one","two","four", "-"], dtype="category") s s = s.cat.set_categories(["one","two","three","four"]) s @@ -276,16 +272,16 @@ Sorting and Order .. warning:: - The default for construction has change in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True`` + The default for construction has changed in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True`` If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a meaning and certain operations are possible. If the categorical is unordered, ``.min()/.max()`` will raise a `TypeError`. .. ipython:: python - s = Series(Categorical(["a","b","c","a"], ordered=False)) + s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort() - s = Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) s.sort() s s.min(), s.max() @@ -303,7 +299,7 @@ This is even true for strings and numeric data: .. 
ipython:: python - s = Series([1,2,3,1], dtype="category") + s = pd.Series([1,2,3,1], dtype="category") s = s.cat.set_categories([2,3,1], ordered=True) s s.sort() @@ -321,7 +317,7 @@ necessarily make the sort order the same as the categories order. .. ipython:: python - s = Series([1,2,3,1], dtype="category") + s = pd.Series([1,2,3,1], dtype="category") s = s.cat.reorder_categories([2,3,1], ordered=True) s s.sort() @@ -347,15 +343,15 @@ Multi Column Sorting ~~~~~~~~~~~~~~~~~~~~ A categorical dtyped column will partcipate in a multi-column sort in a similar manner to other columns. -The ordering of the categorical is determined by the ``categories`` of that columns. +The ordering of the categorical is determined by the ``categories`` of that column. .. ipython:: python - dfs = DataFrame({'A' : Categorical(list('bbeebbaa'),categories=['e','a','b'],ordered=True), - 'B' : [1,2,1,2,2,1,2,1] }) - dfs.sort(['A','B']) + dfs = pd.DataFrame({'A' : pd.Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), + 'B' : [1,2,1,2,2,1,2,1] }) + dfs.sort(['A', 'B']) -Reordering the ``categories``, changes a future sort. +Reordering the ``categories`` changes a future sort. .. ipython:: python @@ -380,14 +376,14 @@ categories or a categorical with any list-like object, will raise a TypeError. Any "non-equality" comparisons of categorical data with a `Series`, `np.array`, `list` or categorical data with different categories or ordering will raise an `TypeError` because custom - categories ordering could be interpreted in two ways: one with taking in account the + categories ordering could be interpreted in two ways: one with taking into account the ordering and one without. .. ipython:: python - cat = Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) + cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) + cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) cat cat_base @@ -443,19 +439,19 @@ present in the data: .. ipython:: python - s = Series(Categorical(["a","b","c","c"], categories=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c"], categories=["c","a","b","d"])) s.value_counts() Groupby will also show "unused" categories: .. ipython:: python - cats = Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) - df = DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) + cats = pd.Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) + df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) df.groupby("cats").mean() - cats2 = Categorical(["a","a","b","b"], categories=["a","b","c"]) - df2 = DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) + cats2 = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) + df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) df2.groupby(["cats","B"]).mean() @@ -463,15 +459,15 @@ Pivot tables: .. 
ipython:: python - raw_cat = Categorical(["a","a","b","b"], categories=["a","b","c"]) - df = DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) + raw_cat = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) + df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) pd.pivot_table(df, values='values', index=['A', 'B']) Data munging ------------ The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix`` ``.at``, and ``.iat``, -work as normal, the only difference is the return type (for getting) and +work as normal. The only difference is the return type (for getting) and that only values already in `categories` can be assigned. Getting @@ -482,10 +478,10 @@ the ``category`` dtype is preserved. .. ipython:: python - idx = Index(["h","i","j","k","l","m","n",]) - cats = Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) + idx = pd.Index(["h","i","j","k","l","m","n",]) + cats = pd.Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) values= [1,2,2,2,3,4,5] - df = DataFrame({"cats":cats,"values":values}, index=idx) + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] df.iloc[2:4,:].dtypes df.loc["h":"j","cats"] @@ -527,10 +523,10 @@ Setting values in a categorical column (or `Series`) works as long as the value .. ipython:: python - idx = Index(["h","i","j","k","l","m","n"]) - cats = Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) + idx = pd.Index(["h","i","j","k","l","m","n"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) values = [1,1,1,1,1,1,1] - df = DataFrame({"cats":cats,"values":values}, index=idx) + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] = [["b",2],["b",2]] df @@ -543,10 +539,10 @@ Setting values by assigning categorical data will also check that the `categorie .. ipython:: python - df.loc["j":"k","cats"] = Categorical(["a","a"], categories=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["a","a"], categories=["a","b"]) df try: - df.loc["j":"k","cats"] = Categorical(["b","b"], categories=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) except ValueError as e: print("ValueError: " + str(e)) @@ -554,9 +550,9 @@ Assigning a `Categorical` to parts of a column of other types will use the value .. ipython:: python - df = DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = Categorical(["b","b"], categories=["a","b"]) - df.loc[2:3,"b"] = Categorical(["b","b"], categories=["a","b"]) + df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) df df.dtypes @@ -569,9 +565,9 @@ but the categories of these categoricals need to be the same: .. ipython:: python - cat = Series(["a","b"], dtype="category") + cat = pd.Series(["a","b"], dtype="category") vals = [1,2] - df = DataFrame({"cats":cat, "vals":vals}) + df = pd.DataFrame({"cats":cat, "vals":vals}) res = pd.concat([df,df]) res res.dtypes @@ -611,12 +607,12 @@ relevant columns back to `category` and assign the right categories and categori .. 
ipython:: python - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) + s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) # rename the categories s.cat.categories = ["very good", "good", "bad"] # reorder the categories and add missing categories s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) - df = DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) + df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) csv = StringIO() df.to_csv(csv) df2 = pd.read_csv(StringIO(csv.getvalue())) @@ -643,10 +639,10 @@ available ("missing value") or `np.nan` is a valid category. .. ipython:: python - s = Series(["a","b",np.nan,"a"], dtype="category") + s = pd.Series(["a","b",np.nan,"a"], dtype="category") # only two categories s - s2 = Series(["a","b","c","a"], dtype="category") + s2 = pd.Series(["a","b","c","a"], dtype="category") s2.cat.categories = [1,2,np.nan] # three categories, np.nan included s2 @@ -660,11 +656,11 @@ available ("missing value") or `np.nan` is a valid category. .. ipython:: python - c = Series(["a","b",np.nan], dtype="category") + c = pd.Series(["a","b",np.nan], dtype="category") c.cat.set_categories(["a","b",np.nan], inplace=True) # will be inserted as a NA category: c[0] = np.nan - s = Series(c) + s = pd.Series(c) s pd.isnull(s) s.fillna("a") @@ -697,7 +693,7 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = Series(['foo','bar']*1000) + s = pd.Series(['foo','bar']*1000) # object dtype s.nbytes @@ -707,12 +703,12 @@ an ``object`` dtype is a constant times the length of the data. .. note:: - If the number of categories approaches the length of the data, the ``Categorical`` will use nearly (or more) memory than an - equivalent ``object`` dtype representation. + If the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same or + more memory than an equivalent ``object`` dtype representation. .. ipython:: python - s = Series(['foo%04d' % i for i in range(2000)]) + s = pd.Series(['foo%04d' % i for i in range(2000)]) # object dtype s.nbytes @@ -734,7 +730,7 @@ will work with the current pandas version, resulting in subtle bugs: .. code-block:: python - >>> cat = Categorical([1,2], [1,2,3]) + >>> cat = pd.Categorical([1,2], [1,2,3]) >>> # old version >>> cat.get_values() array([2, 3], dtype=int64) @@ -762,7 +758,7 @@ object and not as a low-level `numpy` array dtype. This leads to some problems. except TypeError as e: print("TypeError: " + str(e)) - dtype = Categorical(["a"]).dtype + dtype = pd.Categorical(["a"]).dtype try: np.dtype(dtype) except TypeError as e: @@ -780,15 +776,15 @@ To check if a Series contains Categorical data, with pandas 0.16 or later, use .. ipython:: python - hasattr(Series(['a'], dtype='category'), 'cat') - hasattr(Series(['a']), 'cat') + hasattr(pd.Series(['a'], dtype='category'), 'cat') + hasattr(pd.Series(['a']), 'cat') Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python - s = Series(Categorical([1,2,3,4])) + s = pd.Series(pd.Categorical([1,2,3,4])) try: np.sum(s) #same with np.log(s),.. @@ -807,33 +803,36 @@ basic type) and applying along columns will also convert to object. .. 
ipython:: python - df = DataFrame({"a":[1,2,3,4], - "b":["a","b","c","d"], - "cats":Categorical([1,2,3,2])}) + df = pd.DataFrame({"a":[1,2,3,4], + "b":["a","b","c","d"], + "cats":pd.Categorical([1,2,3,2])}) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) -No Categorical Index -~~~~~~~~~~~~~~~~~~~~ +Categorical Index +~~~~~~~~~~~~~~~~~ -There is currently no index of type ``category``, so setting the index to categorical column will -convert the categorical data to a "normal" dtype first and therefore remove any custom -ordering of the categories: +.. versionadded:: 0.16.1 + +A new ``CategoricalIndex`` index type is introduced in version 0.16.1. See the +:ref:`advanced indexing docs ` for a more detailed +explanation. + +Setting the index, will create create a ``CategoricalIndex`` .. ipython:: python - cats = Categorical([1,2,3,4], categories=[4,2,3,1]) + cats = pd.Categorical([1,2,3,4], categories=[4,2,3,1]) strings = ["a","b","c","d"] values = [4,2,3,1] - df = DataFrame({"strings":strings, "values":values}, index=cats) + df = pd.DataFrame({"strings":strings, "values":values}, index=cats) df.index - # This should sort by categories but does not as there is no CategoricalIndex! + # This now sorts by the categories order df.sort_index() -.. note:: - This could change if a `CategoricalIndex` is implemented (see - https://github.com/pydata/pandas/issues/7629) - +In previous versions (<0.16.1) there is no index of type ``category``, so +setting the index to categorical column will convert the categorical data to a +"normal" dtype first and therefore remove any custom ordering of the categories. Side Effects ~~~~~~~~~~~~ @@ -843,12 +842,12 @@ means that changes to the `Series` will in most cases change the original `Categ .. ipython:: python - cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) - s = Series(cat, name="cat") + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = pd.Series(cat, name="cat") cat s.iloc[0:2] = 10 cat - df = DataFrame(s) + df = pd.DataFrame(s) df["cat"].cat.categories = [1,2,3,4,5] cat @@ -856,8 +855,8 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categorical .. ipython:: python - cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) - s = Series(cat, name="cat", copy=True) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = pd.Series(cat, name="cat", copy=True) cat s.iloc[0:2] = 10 cat diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 4b0fe39d929a9..dfb9fab19bf31 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -1,23 +1,22 @@ .. currentmodule:: pandas -.. _computation: .. ipython:: python :suppress: import numpy as np np.random.seed(123456) - from pandas import * - import pandas.util.testing as tm - randn = np.random.randn np.set_printoptions(precision=4, suppress=True) + import pandas as pd import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' + pd.options.display.mpl_style = 'default' import matplotlib.pyplot as plt plt.close('all') - options.display.max_rows=15 + pd.options.display.max_rows=15 + +.. _computation: Computational tools =================== @@ -36,13 +35,13 @@ NA/null values *before* computing the percent change). .. ipython:: python - ser = Series(randn(8)) + ser = pd.Series(np.random.randn(8)) ser.pct_change() .. 
ipython:: python - df = DataFrame(randn(10, 4)) + df = pd.DataFrame(np.random.randn(10, 4)) df.pct_change(periods=3)
@@ -56,8 +55,8 @@ The ``Series`` object has a method ``cov`` to compute covariance between series
.. ipython:: python - s1 = Series(randn(1000)) - s2 = Series(randn(1000)) + s1 = pd.Series(np.random.randn(1000)) + s2 = pd.Series(np.random.randn(1000)) s1.cov(s2) Analogously, ``DataFrame`` has a method ``cov`` to compute pairwise covariances
@@ -78,7 +77,7 @@ among the series in the DataFrame, also excluding NA/null values.
.. ipython:: python - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that
@@ -87,7 +86,7 @@ in order to have a valid result.
.. ipython:: python - frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c']) + frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) frame.ix[:5, 'a'] = np.nan frame.ix[5:10, 'b'] = np.nan
@@ -123,7 +122,7 @@ All of these are currently computed using pairwise complete observations.
.. ipython:: python - frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) frame.ix[::2] = np.nan # Series with Series
@@ -140,7 +139,7 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword:
.. ipython:: python - frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c']) + frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) frame.ix[:5, 'a'] = np.nan frame.ix[5:10, 'b'] = np.nan
@@ -157,8 +156,8 @@ objects.
index = ['a', 'b', 'c', 'd', 'e'] columns = ['one', 'two', 'three', 'four'] - df1 = DataFrame(randn(5, 4), index=index, columns=columns) - df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) + df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) + df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) df1.corrwith(df2) df2.corrwith(df1, axis=1)
@@ -172,7 +171,7 @@ of the ranks (by default) for the group:
.. ipython:: python - s = Series(np.random.randn(5), index=list('abcde')) + s = pd.Series(np.random.randn(5), index=list('abcde')) s['d'] = s['b'] # so there's a tie s.rank()
@@ -181,7 +180,7 @@ or the columns (``axis=1``). ``NaN`` values are excluded from the ranking.
.. ipython:: python - df = DataFrame(np.random.randn(10, 6)) + df = pd.DataFrame(np.random.randn(10, 6)) df[4] = df[2][:5] # some ties df df.rank(1)
@@ -253,13 +252,13 @@ These functions can be applied to ndarrays or Series objects:
.. ipython:: python - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() ts.plot(style='k--') @savefig rolling_mean_ex.png - rolling_mean(ts, 60).plot(style='k') + pd.rolling_mean(ts, 60).plot(style='k') They can also be applied to DataFrame objects. This is really just syntactic sugar for applying the moving window operator to all of the DataFrame's columns:
@@ -271,12 +270,12 @@ sugar for applying the moving window operator to all of the DataFrame's columns: .. 
ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D']) df = df.cumsum() @savefig rolling_mean_frame.png - rolling_sum(df, 60).plot(subplots=True) + pd.rolling_sum(df, 60).plot(subplots=True) The ``rolling_apply`` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function @@ -287,7 +286,7 @@ compute the mean absolute deviation on a rolling basis: mad = lambda x: np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - rolling_apply(ts, 60, mad).plot(style='k') + pd.rolling_apply(ts, 60, mad).plot(style='k') The ``rolling_window`` function performs a generic rolling window computation on the input data. The weights used in the window are specified by the ``win_type`` @@ -310,23 +309,23 @@ keyword. The list of recognized types are: .. ipython:: python - ser = Series(randn(10), index=date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) - rolling_window(ser, 5, 'triang') + pd.rolling_window(ser, 5, 'triang') Note that the ``boxcar`` window is equivalent to ``rolling_mean``. .. ipython:: python - rolling_window(ser, 5, 'boxcar') + pd.rolling_window(ser, 5, 'boxcar') - rolling_mean(ser, 5) + pd.rolling_mean(ser, 5) For some windowing functions, additional parameters must be specified: .. ipython:: python - rolling_window(ser, 5, 'gaussian', std=0.1) + pd.rolling_window(ser, 5, 'gaussian', std=0.1) By default the labels are set to the right edge of the window, but a ``center`` keyword is available so the labels can be set at the center. @@ -334,11 +333,11 @@ This keyword is available in other rolling functions as well. .. ipython:: python - rolling_window(ser, 5, 'boxcar') + pd.rolling_window(ser, 5, 'boxcar') - rolling_window(ser, 5, 'boxcar', center=True) + pd.rolling_window(ser, 5, 'boxcar', center=True) - rolling_mean(ser, 5, center=True) + pd.rolling_mean(ser, 5, center=True) .. _stats.moments.normalization: @@ -377,7 +376,7 @@ For example: .. ipython:: python df2 = df[:20] - rolling_corr(df2, df2['B'], window=5) + pd.rolling_corr(df2, df2['B'], window=5) .. _stats.moments.corr_pairwise: @@ -402,12 +401,12 @@ can even be omitted: .. ipython:: python - covs = rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) + covs = pd.rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) covs[df.index[-50]] .. ipython:: python - correls = rolling_corr(df, 50) + correls = pd.rolling_corr(df, 50) correls[df.index[-50]] .. note:: @@ -441,9 +440,9 @@ they are implemented in pandas such that the following two calls are equivalent: .. ipython:: python - rolling_mean(df, window=len(df), min_periods=1)[:5] + pd.rolling_mean(df, window=len(df), min_periods=1)[:5] - expanding_mean(df)[:5] + pd.expanding_mean(df)[:5] Like the ``rolling_`` functions, the following methods are included in the ``pandas`` namespace or can be located in ``pandas.stats.moments``. @@ -502,7 +501,7 @@ relative impact of an individual data point. As an example, here is the ts.plot(style='k--') @savefig expanding_mean_frame.png - expanding_mean(ts).plot(style='k') + pd.expanding_mean(ts).plot(style='k') .. 
_stats.moments.exponentially_weighted: @@ -584,7 +583,7 @@ Here is an example for a univariate time series: ts.plot(style='k--') @savefig ewma_ex.png - ewma(ts, span=20).plot(style='k') + pd.ewma(ts, span=20).plot(style='k') All the EW functions have a ``min_periods`` argument, which has the same meaning it does for all the ``expanding_`` and ``rolling_`` functions: diff --git a/doc/source/conf.py b/doc/source/conf.py index fcb9c3fdd0016..08fc8483762ab 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -211,7 +211,30 @@ # Additional templates that should be rendered to pages, maps page names to # template names. -# html_additional_pages = {} + +# Add redirect for previously existing API pages (which are now included in +# the API pages as top-level functions) based on a template (GH9911) +moved_api_pages = [ + 'pandas.core.common.isnull', 'pandas.core.common.notnull', 'pandas.core.reshape.get_dummies', + 'pandas.tools.merge.concat', 'pandas.tools.merge.merge', 'pandas.tools.pivot.pivot_table', + 'pandas.tseries.tools.to_datetime', 'pandas.io.clipboard.read_clipboard', 'pandas.io.excel.ExcelFile.parse', + 'pandas.io.excel.read_excel', 'pandas.io.html.read_html', 'pandas.io.json.read_json', + 'pandas.io.parsers.read_csv', 'pandas.io.parsers.read_fwf', 'pandas.io.parsers.read_table', + 'pandas.io.pickle.read_pickle', 'pandas.io.pytables.HDFStore.append', 'pandas.io.pytables.HDFStore.get', + 'pandas.io.pytables.HDFStore.put', 'pandas.io.pytables.HDFStore.select', 'pandas.io.pytables.read_hdf', + 'pandas.io.sql.read_sql', 'pandas.io.sql.read_frame', 'pandas.io.sql.write_frame', + 'pandas.io.stata.read_stata', 'pandas.stats.moments.ewma', 'pandas.stats.moments.ewmcorr', + 'pandas.stats.moments.ewmcov', 'pandas.stats.moments.ewmstd', 'pandas.stats.moments.ewmvar', + 'pandas.stats.moments.expanding_apply', 'pandas.stats.moments.expanding_corr', 'pandas.stats.moments.expanding_count', + 'pandas.stats.moments.expanding_cov', 'pandas.stats.moments.expanding_kurt', 'pandas.stats.moments.expanding_mean', + 'pandas.stats.moments.expanding_median', 'pandas.stats.moments.expanding_quantile', 'pandas.stats.moments.expanding_skew', + 'pandas.stats.moments.expanding_std', 'pandas.stats.moments.expanding_sum', 'pandas.stats.moments.expanding_var', + 'pandas.stats.moments.rolling_apply', 'pandas.stats.moments.rolling_corr', 'pandas.stats.moments.rolling_count', + 'pandas.stats.moments.rolling_cov', 'pandas.stats.moments.rolling_kurt', 'pandas.stats.moments.rolling_mean', + 'pandas.stats.moments.rolling_median', 'pandas.stats.moments.rolling_quantile', 'pandas.stats.moments.rolling_skew', + 'pandas.stats.moments.rolling_std', 'pandas.stats.moments.rolling_sum', 'pandas.stats.moments.rolling_var'] + +html_additional_pages = {'generated/' + page: 'api_redirect.html' for page in moved_api_pages} # If false, no module index is generated. html_use_modindex = True diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index b3b2d272e66c6..1f58992dba017 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -17,8 +17,8 @@ If you are simply looking to start working with the *pandas* codebase, navigate `GitHub "issues" tab `_ and start looking through interesting issues. There are a number of issues listed under `Docs `_ -and `Good as first PR -`_ +and `Difficulty Novice +`_ where you could start out. 
Or maybe through using *pandas* you have an idea of you own or are looking for something
@@ -96,6 +96,8 @@ Getting Started with Git setting up your SSH key, and configuring git. All these steps need to be completed before working seamlessly with your local repository and GitHub. +.. _contributing.forking: + Forking -------
@@ -110,9 +112,11 @@ want to clone your fork to your machine: :: This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository. -You will also need to hook up Travis-CI to your GitHub repository so the suite -is automatically run when a Pull Request is submitted. Instructions are `here -`_. +The testing suite will run automatically on Travis-CI once your Pull Request is +submitted. However, if you wish to run the test suite on a branch prior to +submitting the Pull Request, then Travis-CI needs to be hooked up to your +GitHub repository. Instructions for doing so are `here +`__. Creating a Branch -----------------
@@ -132,6 +136,95 @@ changes in this branch specific to one bug or feature so it is clear what the branch brings to *pandas*. You can have many shiny-new-features and switch in between them using the git checkout command. +To update this branch, you need to retrieve the changes from the master branch:: + + git fetch upstream + git rebase upstream/master + +This will replay your commits on top of the latest pandas git master. If this +leads to merge conflicts, you must resolve these before submitting your Pull +Request. If you have uncommitted changes, you will need to `stash` them prior +to updating. This will effectively store your changes and they can be reapplied +after updating. + +.. _contributing.dev_env: + +Creating a Development Environment +---------------------------------- + +An easy way to create a *pandas* development environment is as follows. + +- Install either :ref:`Install Anaconda ` or :ref:`Install miniconda ` +- Make sure that you have :ref:`cloned the repository ` +- ``cd`` to the pandas source directory + +Tell ``conda`` to create a new environment, named ``pandas_dev``, or any name you would like for this environment by running: + +:: + + conda create -n pandas_dev --file ci/requirements_dev.txt + + +For a python 3 environment + +:: + + conda create -n pandas_dev python=3 --file ci/requirements_dev.txt + + +If you are on ``windows``, then you will need to install the compiler linkages: + +:: + + conda install -n pandas_dev libpython + +This will create the new environment, and not touch any of your existing environments, nor any existing python installation. It will install all of the basic dependencies of *pandas*, as well as the development and testing tools. If you would like to install other dependencies, you can install them as follows: + +:: + + conda install -n pandas_dev -c pandas pytables scipy + +To install *all* pandas dependencies you can do the following: + +:: + + conda install -n pandas_dev -c pandas --file ci/requirements_all.txt + +To work in this environment, ``activate`` it as follows: + +:: + + activate pandas_dev + +At which point, the prompt will change to indicate you are in the new development environment. + +.. note:: + + The above syntax is for ``windows`` environments. To work on ``macosx/linux``, use: + + :: + + source activate pandas_dev + +To view your environments: + +:: + + conda info -e + +To return to your home root environment: + +:: + + deactivate + +See the full ``conda`` docs `here +`__. 
+ +At this point you can easily do an *in-place* install, as detailed in the next section. + +.. _contributing.getting_source: + Making changes -------------- @@ -237,9 +330,15 @@ follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of ``numpydoc`` is included in the *pandas* source code. +It is easiest to :ref:`create a development environment `, then install: + +:: + + conda install -n pandas_dev sphinx ipython + Furthermore, it is recommended to have all `optional dependencies `_ -installed. This is not needed, but be aware that you will see some error +installed. This is not strictly necessary, but be aware that you will see some error messages. Because all the code in the documentation is executed during the doc build, the examples using this optional dependencies will generate errors. Run ``pd.show_versions()`` to get an overview of the installed version of all @@ -252,7 +351,7 @@ dependencies. Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ -So how do you build the docs? Navigate to your local the folder +So how do you build the docs? Navigate to your local the folder ``pandas/doc/`` directory in the console and run:: python make.py html @@ -272,8 +371,9 @@ If you want to do a full clean build, do:: Starting with 0.13.1 you can tell ``make.py`` to compile only a single section of the docs, greatly reducing the turn-around time for checking your changes. -You will be prompted to delete `.rst` files that aren't required, since the -last committed version can always be restored from git. +You will be prompted to delete `.rst` files that aren't required. This is okay +since the prior version can be checked out from git, but make sure to +not commit the file deletions. :: @@ -295,6 +395,13 @@ browser to see the full documentation you just built:: And you'll have the satisfaction of seeing your new and improved documentation! +.. _contributing.dev_docs: + +Built Master Branch Documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When pull-requests are merged into the pandas *master* branch, the main parts of the documentation are +also built by Travis-CI. These docs are then hosted `here `__. Contributing to the code base ============================= @@ -324,7 +431,7 @@ deprecation warnings where needed. Test-driven Development/Writing Code ------------------------------------ -*Pandas* is serious about `Test-driven Development (TDD) +*Pandas* is serious about testing and strongly encourages individuals to embrace `Test-driven Development (TDD) `_. This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired @@ -457,8 +564,8 @@ Doing 'git status' again should give something like :: # modified: /relative/path/to/file-you-added.py # -Finally, commit your changes to your local repository with an explanatory message. An informal -commit message format is in effect for the project. Please try to adhere to it. Here are +Finally, commit your changes to your local repository with an explanatory message. *Pandas* +uses a convention for commit message prefixes and layout. Here are some common prefixes along with general guidelines for when to use them: * ENH: Enhancement, new functionality @@ -572,6 +679,3 @@ branch has not actually been merged. 
The branch will still exist on GitHub, so to delete it there do :: git push origin --delete shiny-new-feature - - - diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 0e6386955a653..f69f926296020 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1006,6 +1006,9 @@ The :ref:`HDFStores ` docs `Merging on-disk tables with millions of rows `__ +`Avoiding inconsistencies when writing to a store from multiple processes/threads +`__ + De-duplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from csv file and creating a store by chunks, with date parsing as well. `See here diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index e1c14029f1cf9..9221f2685d79b 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -461,7 +461,7 @@ Inspired by `dplyr's `__ ``mutate`` verb, DataFrame has an :meth:`~pandas.DataFrame.assign` method that allows you to easily create new columns that are potentially -derived from existing columns. +derived from existing columns. .. ipython:: python @@ -511,7 +511,9 @@ DataFrame is returned, with the new values inserted. .. warning:: Since the function signature of ``assign`` is ``**kwargs``, a dictionary, - the order of the new columns in the resulting DataFrame cannot be guaranteed. + the order of the new columns in the resulting DataFrame cannot be guaranteed + to match the order you pass in. To make things predictable, items are inserted + alphabetically (by key) at the end of the DataFrame. All expressions are computed first, and then assigned. So you can't refer to another column being assigned in the same call to ``assign``. For example: @@ -575,10 +577,8 @@ row-wise. For example: df - df.iloc[0] -In the special case of working with time series data, if the Series is a -TimeSeries (which it will be automatically if the index contains datetime -objects), and the DataFrame index also contains dates, the broadcasting will be -column-wise: +In the special case of working with time series data, and the DataFrame index +also contains dates, the broadcasting will be column-wise: .. ipython:: python :okwarning: diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 4a0743b8be3e4..c70b6deade36e 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -57,7 +57,7 @@ large data to thin clients. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. -Based on `"The Grammer of Graphics" `__ it +Based on `"The Grammar of Graphics" `__ it provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data. It's really quite incredible. Various implementations to other languages are available, but a faithful implementation for python users has long been missing. Although still young @@ -137,6 +137,24 @@ PyDatastream is a Python interface to the SOAP API to return indexed Pandas DataFrames or Panels with financial data. This package requires valid credentials for this API (non free). +`pandaSDMX `_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +pandaSDMX is an extensible library to retrieve and acquire statistical data +and metadata disseminated in +`SDMX `_ 2.1. This standard is currently supported by +the European statistics office (Eurostat) +and the European Central Bank (ECB). Datasets may be returned as pandas Series +or multi-indexed DataFrames. 
+ +`fredapi `_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +fredapi is a Python interface to the `Federal Reserve Economic Data (FRED) `__ +provided by the Federal Reserve Bank of St. Louis. It works with both the FRED database and the ALFRED database, which +contains point-in-time data (i.e. historic data revisions). fredapi provides a wrapper in Python to the FRED +HTTP API, and also provides several convenient methods for parsing and analyzing point-in-time data from ALFRED. +fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that +you can obtain for free on the FRED website. + .. _ecosystem.domain: diff --git a/doc/source/enhancingperf.rst index e6b735173110b..d007446a5b922 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -66,7 +66,7 @@ Here's the function in pure python: s += f(a + i * dx) return s * dx -We achieve our result by by using ``apply`` (row-wise): +We achieve our result by using ``apply`` (row-wise): .. ipython:: python @@ -86,7 +86,7 @@ hence we'll concentrate our efforts cythonizing these two functions. .. note:: In python 2 replacing the ``range`` with its generator counterpart (``xrange``) - would mean the ``range`` line would vanish. In python 3 range is already a generator. + would mean the ``range`` line would vanish. In python 3 ``range`` is already a generator. .. _enhancingperf.plain: @@ -248,7 +248,7 @@ efforts here. More advanced techniques ~~~~~~~~~~~~~~~~~~~~~~~~ -There is still scope for improvement, here's an example of using some more +There is still room for improvement. Here's an example of using some more advanced cython techniques: .. ipython:: @@ -373,7 +373,7 @@ This Python syntax is **not** allowed: :func:`~pandas.eval` Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`pandas.eval` works well with expressions containing large arrays +:func:`pandas.eval` works well with expressions containing large arrays. First let's create a few decent-sized arrays to play with: diff --git a/doc/source/faq.rst index 467ec02b55f20..1fc8488e92fde 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -207,9 +207,9 @@ properties. Here are the pandas equivalents: Frequency conversion ~~~~~~~~~~~~~~~~~~~~ -Frequency conversion is implemented using the ``resample`` method on TimeSeries -and DataFrame objects (multiple time series). ``resample`` also works on panels -(3D). Here is some code that resamples daily data to monthly: +Frequency conversion is implemented using the ``resample`` method on Series +and DataFrame objects with a DatetimeIndex or PeriodIndex. ``resample`` also +works on panels (3D). Here is some code that resamples daily data to monthly: .. ipython:: python diff --git a/doc/source/groupby.rst index 7ad2641dec52a..c9e18b585c764 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems: df.groupby('A').std() -NA group handling -~~~~~~~~~~~~~~~~~ +NA and NaT group handling +~~~~~~~~~~~~~~~~~~~~~~~~~ -If there are any NaN values in the grouping key, these will be automatically -excluded. So there will never be an "NA group". This was not the case in older +If there are any NaN or NaT values in the grouping key, these will be automatically +excluded. So there will never be an "NA group" or "NaT group". 
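For instance, a minimal sketch (with made-up data) of the exclusion described above::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'b', np.nan, 'b'],
                       'value': [1, 2, 3, 4]})

    # the row whose key is NaN is dropped from the grouping entirely,
    # so only the 'a' and 'b' groups appear in the result
    df.groupby('key').sum()
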
This was not the case in older versions of pandas, but users were generally discarding the NA group anyway (and supporting it was an implementation headache). diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index ee779715bcb95..fb63d0c6d66f1 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -115,6 +115,7 @@ See the package overview for more detail about what's in the library. {%if not single -%} whatsnew install + contributing faq overview 10min @@ -149,7 +150,6 @@ See the package overview for more detail about what's in the library. api {% endif -%} {%if not single -%} - contributing internals release {% endif -%} diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index fc074802353ee..a1912032bc3bf 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -30,9 +30,9 @@ The axis labeling information in pandas objects serves many purposes: In this section, we will focus on the final point: namely, how to slice, dice, and generally get and set subsets of pandas objects. The primary focus will be on Series and DataFrame as they have received more development attention in -this area. Expect more work to be invested higher-dimensional data structures -(including ``Panel``) in the future, especially in label-based advanced -indexing. +this area. Expect more work to be invested in higher-dimensional data +structures (including ``Panel``) in the future, especially in label-based +advanced indexing. .. note:: @@ -54,7 +54,7 @@ indexing. .. warning:: - In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` + In 0.15.0 ``Index`` has internally been refactored to no longer subclass ``ndarray`` but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This should be a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) @@ -225,9 +225,9 @@ new column. sa.a = 5 sa - dfa.A = list(range(len(dfa.index))) # ok if A already exists + dfa.A = list(range(len(dfa.index))) # ok if A already exists dfa - dfa['A'] = list(range(len(dfa.index))) # use this form to create a new column + dfa['A'] = list(range(len(dfa.index))) # use this form to create a new column dfa .. warning:: @@ -249,6 +249,14 @@ new column. If you are using the IPython environment, you may also use tab-completion to see these accessible attributes. +You can also assign a ``dict`` to a row of a ``DataFrame``: + +.. ipython:: python + + x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + x.iloc[1] = dict(x=9, y=99) + x + Slicing ranges -------------- @@ -314,7 +322,7 @@ Selection By Label dfl.loc['20130102':'20130104'] pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. -**at least 1** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. +**At least 1** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. The ``.loc`` attribute is the primary access method. The following are valid inputs: @@ -500,6 +508,81 @@ A list of indexers where any element is out of bounds will raise an .. 
_indexing.basics.partial_setting: +Selecting Random Samples +------------------------ +.. versionadded::0.16.1 + +A random selection of rows or columns from a Series, DataFrame, or Panel with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows. + +.. ipython :: python + + s = Series([0,1,2,3,4,5]) + + # When no arguments are passed, returns 1 row. + s.sample() + + # One may specify either a number of rows: + s.sample(n=3) + + # Or a fraction of the rows: + s.sample(frac=0.5) + +By default, ``sample`` will return each row at most once, but one can also sample with replacement +using the ``replace`` option: + +.. ipython :: python + + s = Series([0,1,2,3,4,5]) + + # Without replacement (default): + s.sample(n=6, replace=False) + + # With replacement: + s.sample(n=6, replace=True) + + +By default, each row has an equal probability of being selected, but if you want rows +to have different probabilities, you can pass the ``sample`` function sampling weights as +``weights``. These weights can be a list, a numpy array, or a Series, but they must be of the same length as the object you are sampling. Missing values will be treated as a weight of zero, and inf values are not allowed. If weights do not sum to 1, they will be re-normalized by dividing all weights by the sum of the weights. For example: + +.. ipython :: python + + s = Series([0,1,2,3,4,5]) + example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] + s.sample(n=3, weights=example_weights) + + # Weights will be re-normalized automatically + example_weights2 = [0.5, 0, 0, 0, 0, 0] + s.sample(n=1, weights=example_weights2) + +When applied to a DataFrame, you can use a column of the DataFrame as sampling weights +(provided you are sampling rows and not columns) by simply passing the name of the column +as a string. + +.. ipython :: python + + df2 = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) + df2.sample(n = 3, weights = 'weight_column') + +``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. + +.. ipython :: python + + df3 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) + df3.sample(n=1, axis=1) + +Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a numpy RandomState object. + +.. ipython :: python + + df4 = DataFrame({'col1':[1,2,3], 'col2':[2,3,4]}) + + # With a given seed, the sample will always draw the same rows. + df4.sample(n=2, random_state=2) + df4.sample(n=2, random_state=2) + + + Setting With Enlargement ------------------------ @@ -578,9 +661,10 @@ Using a boolean vector to index a Series works exactly as in a numpy ndarray: .. ipython:: python + s = Series(range(-3, 4)) + s s[s > 0] - s[(s < 0) & (s > -0.5)] - s[(s < -1) | (s > 1 )] + s[(s < -1) | (s > 0.5)] s[~(s < 0)] You may select rows from a DataFrame using a boolean vector the same length as diff --git a/doc/source/install.rst b/doc/source/install.rst index dd9021d0439dc..b3f86db5e3e59 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -35,6 +35,8 @@ pandas at all. Simply create an account, and have access to pandas from within your brower via an `IPython Notebook `__ in a few minutes. +.. 
_install.anaconda: + Installing pandas with Anaconda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -66,6 +68,8 @@ admin rights to install it, it will install in the user's home directory, and this also makes it trivial to delete Anaconda at a later date (just delete that folder). +.. _install.miniconda: + Installing pandas with Miniconda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -173,47 +177,8 @@ Installing using your Linux distribution's package manager. Installing from source ~~~~~~~~~~~~~~~~~~~~~~ -.. note:: - - Installing from the git repository requires a recent installation of `Cython - `__ as the cythonized C sources are no longer checked - into source control. Released source distributions will contain the built C - files. I recommend installing the latest Cython via ``easy_install -U - Cython`` - -The source code is hosted at http://github.com/pydata/pandas, it can be checked -out using git and compiled / installed like so: - -:: - - git clone git://github.com/pydata/pandas.git - cd pandas - python setup.py install - -Make sure you have Cython installed when installing from the repository, -rather then a tarball or pypi. -On Windows, I suggest installing the MinGW compiler suite following the -directions linked to above. Once configured property, run the following on the -command line: - -:: - - python setup.py build --compiler=mingw32 - python setup.py install - -Note that you will not be able to import pandas if you open an interpreter in -the source directory unless you build the C extensions in place: - -:: - - python setup.py build_ext --inplace - -The most recent version of MinGW (any installer dated after 2011-08-03) -has removed the '-mno-cygwin' option but Distutils has not yet been updated to -reflect that. Thus, you may run into an error like "unrecognized command line -option '-mno-cygwin'". Until the bug is fixed in Distutils, you may need to -install a slightly older version of MinGW (2011-08-02 installer). +See the :ref:`contributing documentation ` for complete instructions on building from the git source tree. Further, see :ref:`creating a devevlopment environment ` if you wish to create a *pandas* development environment. Running the test suite ~~~~~~~~~~~~~~~~~~~~~~ @@ -278,7 +243,7 @@ Optional Dependencies * `Cython `__: Only necessary to build development version. Version 0.19.1 or higher. * `SciPy `__: miscellaneous statistical functions -* `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required. +* `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.0 or higher highly recommended. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. * `matplotlib `__: for plotting * `statsmodels `__ @@ -290,6 +255,7 @@ Optional Dependencies * Alternative Excel writer. * `boto `__: necessary for Amazon S3 access. +* `blosc `__: for msgpack compression using ``blosc`` * One of `PyQt4 `__, `PySide `__, `pygtk @@ -354,4 +320,3 @@ Optional Dependencies work. Hence, it is highly recommended that you install these. A packaged distribution like `Enthought Canopy `__ may be worth considering. - diff --git a/doc/source/internals.rst b/doc/source/internals.rst index 9418ca5265f1a..17be04cd64d27 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -94,4 +94,155 @@ not check (or care) whether the levels themselves are sorted. 
Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +.. _ref-subclassing-pandas: + +Subclassing pandas Data Structures +---------------------------------- + +.. warning:: There are some easier alternatives to consider before subclassing ``pandas`` data structures. + + 1. Monkey-patching: See :ref:`Adding Features to your pandas Installation `. + + 2. Use *composition*. See `here `_. + +This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are two points which need attention: + +1. Override constructor properties. +2. Define original properties. + +.. note:: You can find a nice example in the `geopandas `_ project. + +Override Constructor Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each data structure has constructor properties that specify the data constructors. By overriding these properties, the classes you define are retained through ``pandas`` data manipulations. + +There are three constructors to be defined: + +- ``_constructor``: Used when a manipulation result has the same dimensions as the original. +- ``_constructor_sliced``: Used when a manipulation result is one dimension lower than the original, such as slicing a single column of a ``DataFrame``. +- ``_constructor_expanddim``: Used when a manipulation result is one dimension higher than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. + +The following table shows how ``pandas`` data structures define constructor properties by default. + +=========================== ======================= =================== ======================= +Property Attributes ``Series`` ``DataFrame`` ``Panel`` +=========================== ======================= =================== ======================= +``_constructor`` ``Series`` ``DataFrame`` ``Panel`` +``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame`` +``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` +=========================== ======================= =================== ======================= + +The example below shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` by overriding the constructor properties. + +.. code-block:: python + + class SubclassedSeries(Series): + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(DataFrame): + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + +.. code-block:: python + + >>> s = SubclassedSeries([1, 2, 3]) + >>> type(s) + + + >>> to_framed = s.to_frame() + >>> type(to_framed) + + + >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> type(df) + + + >>> sliced1 = df[['A', 'B']] + >>> sliced1 + A B + 0 1 4 + 1 2 5 + 2 3 6 + >>> type(sliced1) + + + >>> sliced2 = df['A'] + >>> sliced2 + 0 1 + 1 2 + 2 3 + Name: A, dtype: int64 + >>> type(sliced2) + +Define Original Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names by overriding ``__getattribute__``. Defining original properties can be done in one of two ways: + +1. 
Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. +2. Define ``_metadata`` for normal properties which will be passed to manipulation results. + +Below is an example that defines two original properties, "internal_cache" as a temporary property and "added_property" as a normal property: + +.. code-block:: python + + class SubclassedDataFrame2(DataFrame): + + # temporary properties + _internal_names = DataFrame._internal_names + ['internal_cache'] + _internal_names_set = set(_internal_names) + + # normal properties + _metadata = ['added_property'] + + @property + def _constructor(self): + return SubclassedDataFrame2 + +.. code-block:: python + + >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.internal_cache = 'cached' + >>> df.added_property = 'property' + + >>> df.internal_cache + cached + >>> df.added_property + property + + # properties defined in _internal_names are reset after manipulation + >>> df[['A', 'B']].internal_cache + AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' + + # properties defined in _metadata are retained + >>> df[['A', 'B']].added_property + property + diff --git a/doc/source/io.rst index 1c8a1159ab162..73a2f2f1d3531 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -89,6 +89,8 @@ They can take a number of arguments: - ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file (much faster than using a regular expression) - ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly. + Set to ``'infer'`` (the default) to guess a format based on the file + extension. - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more ways to specify the file format - ``dtype``: A data type name or a dict of column name to data type. If not @@ -2362,6 +2364,10 @@ for some advanced strategies As of version 0.15.0, pandas requires ``PyTables`` >= 3.0.0. Stores written with prior versions of pandas / ``PyTables`` >= 2.3 are fully compatible (this was the previous minimum ``PyTables`` required version). +.. warning:: + + There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version. + .. ipython:: python :suppress: :okexcept: @@ -3994,6 +4000,24 @@ whether imported ``Categorical`` variables are ordered. a ``Categorial`` with string categories for the values that are labeled and numeric categories for values with no label. +.. _io.other: + +Other file formats +------------------ + +pandas itself only supports IO with a limited set of file formats that map +cleanly to its tabular data model. For reading and writing other file formats +into and from pandas, we recommend these packages from the broader community. + +netCDF +~~~~~~ + +xray_ provides data structures inspired by the pandas DataFrame for working +with multi-dimensional datasets, with a focus on the netCDF file format and +easy conversion to and from pandas. + +.. _xray: http://xray.readthedocs.org/ + +.. 
_io.perf: Performance Considerations diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 7128e2dd82d6c..d51c2f62b8a0c 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -12,6 +12,12 @@ randn = np.random.randn np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + import pandas.util.doctools as doctools + p = doctools.TablePlotter() + + **************************** Merge, join, and concatenate **************************** @@ -37,14 +43,34 @@ a simple example: .. ipython:: python - df = DataFrame(np.random.randn(10, 4)) - df + df1 = DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=[0, 1, 2, 3]) + + df2 = DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], + 'B': ['B4', 'B5', 'B6', 'B7'], + 'C': ['C4', 'C5', 'C6', 'C7'], + 'D': ['D4', 'D5', 'D6', 'D7']}, + index=[4, 5, 6, 7]) + + df3 = DataFrame({'A': ['A8', 'A9', 'A10', 'A11'], + 'B': ['B8', 'B9', 'B10', 'B11'], + 'C': ['C8', 'C9', 'C10', 'C11'], + 'D': ['D8', 'D9', 'D10', 'D11']}, + index=[8, 9, 10, 11]) + + frames = [df1, df2, df3] + result = concat(frames) - # break it into pieces - pieces = [df[:3], df[3:7], df[7:]] +.. ipython:: python + :suppress: - concatenated = concat(pieces) - concatenated + @savefig merging_concat_basic.png + p.plot(frames, result, + labels=['df1', 'df2', 'df3'], vertical=True); + plt.close('all'); Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat`` takes a list or dict of homogeneously-typed objects and concatenates them with @@ -86,8 +112,15 @@ this using the ``keys`` argument: .. ipython:: python - concatenated = concat(pieces, keys=['first', 'second', 'third']) - concatenated + result = concat(frames, keys=['x', 'y', 'z']) + +.. ipython:: python + :suppress: + + @savefig merging_concat_keys.png + p.plot(frames, result, + labels=['df1', 'df2', 'df3'], vertical=True) + plt.close('all'); As you can see (if you've read the rest of the documentation), the resulting object's index has a :ref:`hierarchical index `. This @@ -95,7 +128,7 @@ means that we can now do stuff like select out each chunk by key: .. ipython:: python - concatenated.ix['second'] + result.ix['y'] It's not a stretch to see how this can be very useful. More detail on this functionality below. @@ -130,29 +163,50 @@ behavior: .. ipython:: python - from pandas.util.testing import rands_array - df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'], - index=rands_array(5, 10)) - df + df4 = DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], + 'D': ['D2', 'D3', 'D6', 'D7'], + 'F': ['F2', 'F3', 'F6', 'F7']}, + index=[2, 3, 6, 7]) + result = concat([df1, df4], axis=1) + + +.. ipython:: python + :suppress: - concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], - df.ix[-7:, ['d']]], axis=1) + @savefig merging_concat_axis1.png + p.plot([df1, df4], result, + labels=['df1', 'df4'], vertical=False); + plt.close('all'); Note that the row indexes have been unioned and sorted. Here is the same thing with ``join='inner'``: .. ipython:: python - concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], - df.ix[-7:, ['d']]], axis=1, join='inner') + result = concat([df1, df4], axis=1, join='inner') + +.. ipython:: python + :suppress: + + @savefig merging_concat_axis1_inner.png + p.plot([df1, df4], result, + labels=['df1', 'df4'], vertical=False); + plt.close('all'); Lastly, suppose we just wanted to reuse the *exact index* from the original DataFrame: .. 
ipython:: python - concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], - df.ix[-7:, ['d']]], axis=1, join_axes=[df.index]) + result = concat([df1, df4], axis=1, join_axes=[df1.index]) + +.. ipython:: python + :suppress: + + @savefig merging_concat_axis1_join_axes.png + p.plot([df1, df4], result, + labels=['df1', 'df4'], vertical=False); + plt.close('all'); .. _merging.concatenation: @@ -165,32 +219,44 @@ along ``axis=0``, namely the index: .. ipython:: python - s = Series(randn(10), index=np.arange(10)) - s1 = s[:5] # note we're slicing with labels here, so 5 is included - s2 = s[6:] - s1.append(s2) + result = df1.append(df2) + +.. ipython:: python + :suppress: + + @savefig merging_append1.png + p.plot([df1, df2], result, + labels=['df1', 'df2'], vertical=True); + plt.close('all'); In the case of DataFrame, the indexes must be disjoint but the columns do not need to be: .. ipython:: python - df = DataFrame(randn(6, 4), index=date_range('1/1/2000', periods=6), - columns=['A', 'B', 'C', 'D']) - df1 = df.ix[:3] - df2 = df.ix[3:, :3] - df1 - df2 - df1.append(df2) + result = df1.append(df4) + +.. ipython:: python + :suppress: + + @savefig merging_append2.png + p.plot([df1, df4], result, + labels=['df1', 'df4'], vertical=True); + plt.close('all'); ``append`` may take multiple objects to concatenate: .. ipython:: python - df1 = df.ix[:2] - df2 = df.ix[2:4] - df3 = df.ix[4:] - df1.append([df2,df3]) + result = df1.append([df2, df3]) + +.. ipython:: python + :suppress: + + @savefig merging_append3.png + p.plot([df1, df2, df3], result, + labels=['df1', 'df2', 'df3'], vertical=True); + plt.close('all'); .. note:: @@ -205,25 +271,33 @@ Ignoring indexes on the concatenation axis For DataFrames which don't have a meaningful index, you may wish to append them and ignore the fact that they may have overlapping indexes: -.. ipython:: python - - df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) - df2 = DataFrame(randn(3, 4), columns=['A', 'B', 'C', 'D']) +To do this, use the ``ignore_index`` argument: - df1 - df2 +.. ipython:: python -To do this, use the ``ignore_index`` argument: + result = concat([df1, df4], ignore_index=True) .. ipython:: python + :suppress: - concat([df1, df2], ignore_index=True) + @savefig merging_concat_ignore_index.png + p.plot([df1, df4], result, + labels=['df1', 'df4'], vertical=True); + plt.close('all'); This is also a valid argument to ``DataFrame.append``: .. ipython:: python - df1.append(df2, ignore_index=True) + result = df1.append(df4, ignore_index=True) + +.. ipython:: python + :suppress: + + @savefig merging_append_ignore_index.png + p.plot([df1, df4], result, + labels=['df1', 'df4'], vertical=True); + plt.close('all'); .. _merging.mixed_ndims: @@ -236,22 +310,45 @@ the name of the Series. .. ipython:: python - df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) - s1 = Series(randn(6), name='foo') - concat([df1, s1],axis=1) + s1 = Series(['X0', 'X1', 'X2', 'X3'], name='X') + result = concat([df1, s1], axis=1) + +.. ipython:: python + :suppress: + + @savefig merging_concat_mixed_ndim.png + p.plot([df1, s1], result, + labels=['df1', 's1'], vertical=False); + plt.close('all'); If unnamed Series are passed they will be numbered consecutively. .. ipython:: python - s2 = Series(randn(6)) - concat([df1, s2, s2, s2],axis=1) + s2 = Series(['_0', '_1', '_2', '_3']) + result = concat([df1, s2, s2, s2], axis=1) + +.. 
ipython:: python + :suppress: + + @savefig merging_concat_unnamed_series.png + p.plot([df1, s2], result, + labels=['df1', 's2'], vertical=False); + plt.close('all'); Passing ``ignore_index=True`` will drop all name references. .. ipython:: python - concat([df1, s1],axis=1,ignore_index=True) + result = concat([df1, s1], axis=1, ignore_index=True) + +.. ipython:: python + :suppress: + + @savefig merging_concat_series_ignore_index.png + p.plot([df1, s1], result, + labels=['df1', 's1'], vertical=False); + plt.close('all'); More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -260,43 +357,71 @@ Let's consider a variation on the first example presented: .. ipython:: python - df = DataFrame(np.random.randn(10, 4)) - df + result = concat(frames, keys=['x', 'y', 'z']) - # break it into pieces - pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] +.. ipython:: python + :suppress: - result = concat(pieces, axis=1, keys=['one', 'two', 'three']) - result + @savefig merging_concat_group_keys2.png + p.plot(frames, result, + labels=['df1', 'df2', 'df3'], vertical=True); + plt.close('all'); You can also pass a dict to ``concat`` in which case the dict keys will be used for the ``keys`` argument (unless other keys are specified): .. ipython:: python - pieces = {'one': df.ix[:, [0, 1]], - 'two': df.ix[:, [2]], - 'three': df.ix[:, [3]]} - concat(pieces, axis=1) - concat(pieces, keys=['three', 'two']) + pieces = {'x': df1, 'y': df2, 'z': df3} + result = concat(pieces) + +.. ipython:: python + :suppress: + + @savefig merging_concat_dict.png + p.plot([df1, df2, df3], result, + labels=['df1', 'df2', 'df3'], vertical=True); + plt.close('all'); + +.. ipython:: python + + result = concat(pieces, keys=['z', 'y']) + +.. ipython:: python + :suppress: + + @savefig merging_concat_dict_keys.png + p.plot([df1, df2, df3], result, + labels=['df1', 'df2', 'df3'], vertical=True); + plt.close('all'); The MultiIndex created has levels that are constructed from the passed keys and -the columns of the DataFrame pieces: +the index of the DataFrame pieces: .. ipython:: python - result.columns.levels + result.index.levels If you wish to specify other levels (as will occasionally be the case), you can do so using the ``levels`` argument: .. ipython:: python - result = concat(pieces, axis=1, keys=['one', 'two', 'three'], - levels=[['three', 'two', 'one', 'zero']], + result = concat(pieces, keys=['x', 'y', 'z'], + levels=[['z', 'y', 'x', 'w']], names=['group_key']) - result - result.columns.levels + +.. ipython:: python + :suppress: + + @savefig merging_concat_dict_keys_names.png + p.plot([df1, df2, df3], result, + labels=['df1', 'df2', 'df3'], vertical=True); + plt.close('all'); + +.. ipython:: python + + result.index.levels Yes, this is fairly esoteric, but is actually necessary for implementing things like GroupBy where the order of a categorical variable is meaningful. @@ -312,10 +437,16 @@ which returns a new DataFrame as above. .. ipython:: python - df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) - df - s = df.xs(3) - df.append(s, ignore_index=True) + s2 = Series(['X0', 'X1', 'X2', 'X3'], index=['A', 'B', 'C', 'D']) + result = df1.append(s2, ignore_index=True) + +.. ipython:: python + :suppress: + + @savefig merging_append_series_as_row.png + p.plot([df1, s2], result, + labels=['df1', 's2'], vertical=True); + plt.close('all'); You should use ``ignore_index`` with this method to instruct DataFrame to discard its index. 
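For instance, appending a ``Series`` that has no name only works with ``ignore_index=True``; a minimal sketch with made-up values::

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    row = pd.Series([5, 6], index=['A', 'B'])   # an unnamed Series

    # without ignore_index=True this call would raise a TypeError,
    # since the Series has no name to use as the new row label
    df.append(row, ignore_index=True)
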
If you wish to preserve the index, you should construct an @@ -325,12 +456,17 @@ You can also pass a list of dicts or Series: .. ipython:: python - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) - dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, - {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] - result = df.append(dicts, ignore_index=True) - result + dicts = [{'A': 1, 'B': 2, 'C': 3, 'X': 4}, + {'A': 5, 'B': 6, 'C': 7, 'Y': 8}] + result = df1.append(dicts, ignore_index=True) + +.. ipython:: python + :suppress: + + @savefig merging_append_dits.png + p.plot([df1, DataFrame(dicts)], result, + labels=['df1', 'dicts'], vertical=True); + plt.close('all'); .. _merging.join: @@ -354,7 +490,7 @@ standard database join operations between DataFrame objects: :: - merge(left, right, how='left', on=None, left_on=None, right_on=None, + merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True) @@ -430,24 +566,46 @@ key combination: .. ipython:: python - left = DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) - right = DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) - left - right - merge(left, right, on='key') + left = DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], + 'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3']}) + + right = DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], + 'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}) + result = merge(left, right, on='key') + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); Here is a more complicated example with multiple join keys: .. ipython:: python - left = DataFrame({'key1': ['foo', 'foo', 'bar'], - 'key2': ['one', 'two', 'one'], - 'lval': [1, 2, 3]}) - right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], - 'key2': ['one', 'one', 'one', 'two'], - 'rval': [4, 5, 6, 7]}) - merge(left, right, how='outer') - merge(left, right, how='inner') + left = DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], + 'key2': ['K0', 'K1', 'K0', 'K1'], + 'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3']}) + + right = DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], + 'key2': ['K0', 'K0', 'K0', 'K0'], + 'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}) + + result = merge(left, right, on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_multiple.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); The ``how`` argument to ``merge`` specifies how to determine which keys are to be included in the resulting table. If a key combination **does not appear** in @@ -463,6 +621,53 @@ either the left or right tables, the values in the joined table will be ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames ``inner``, ``INNER JOIN``, Use intersection of keys from both frames +.. ipython:: python + + result = merge(left, right, how='left', on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_left.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. ipython:: python + + result = merge(left, right, how='right', on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_right.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + +.. 
ipython:: python + + result = merge(left, right, how='outer', on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_outer.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. ipython:: python + + result = merge(left, right, how='inner', on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_inner.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + .. _merging.join.index: Joining on index @@ -474,14 +679,47 @@ is a very basic example: .. ipython:: python - df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) - df1 = df.ix[1:, ['A', 'B']] - df2 = df.ix[:5, ['C', 'D']] - df1 - df2 - df1.join(df2) - df1.join(df2, how='outer') - df1.join(df2, how='inner') + left = DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=['K0', 'K1', 'K2']) + + right = DataFrame({'C': ['C0', 'C2', 'C3'], + 'D': ['D0', 'D2', 'D3']}, + index=['K0', 'K2', 'K3']) + + result = left.join(right) + +.. ipython:: python + :suppress: + + @savefig merging_join.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. ipython:: python + + result = left.join(right, how='outer') + +.. ipython:: python + :suppress: + + @savefig merging_join_outer.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. ipython:: python + + result = left.join(right, how='inner') + +.. ipython:: python + :suppress: + + @savefig merging_join_inner.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); The data alignment here is on the indexes (row labels). This same behavior can be achieved using ``merge`` plus additional arguments instructing it to use the @@ -489,7 +727,27 @@ indexes: .. ipython:: python - merge(df1, df2, left_index=True, right_index=True, how='outer') + result = merge(left, right, left_index=True, right_index=True, how='outer') + +.. ipython:: python + :suppress: + + @savefig merging_merge_index_outer.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. ipython:: python + + result = merge(left, right, left_index=True, right_index=True, how='inner'); + +.. ipython:: python + :suppress: + + @savefig merging_merge_index_inner.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); Joining key columns on an index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -511,14 +769,36 @@ key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - df['key'] = ['foo', 'bar'] * 4 - to_join = DataFrame(randn(2, 2), index=['bar', 'foo'], - columns=['j1', 'j2']) - df - to_join - df.join(to_join, on='key') - merge(df, to_join, left_on='key', right_index=True, - how='left', sort=False) + left = DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key': ['K0', 'K1', 'K0', 'K1']}) + + right = DataFrame({'C': ['C0', 'C1'], + 'D': ['D0', 'D1']}, + index=['K0', 'K1']) + + result = left.join(right, on='key') + +.. ipython:: python + :suppress: + + @savefig merging_join_key_columns.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. ipython:: python + + result = merge(left, right, left_on='key', right_index=True, + how='left', sort=False); + +.. 
ipython:: python + :suppress: + + @savefig merging_merge_key_columns.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); .. _merging.multikey_join: @@ -526,31 +806,30 @@ To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: .. ipython:: python - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + left = DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key1': ['K0', 'K0', 'K1', 'K2'], + 'key2': ['K0', 'K1', 'K0', 'K1']}) - data = np.random.randn(len(key1)) - data = DataFrame({'key1' : key1, 'key2' : key2, - 'data' : data}) - data - to_join + index = MultiIndex.from_tuples([('K0', 'K0'), ('K1', 'K0'), + ('K2', 'K0'), ('K2', 'K1')]) + right = DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index) Now this can be joined by passing the two key column names: .. ipython:: python - data.join(to_join, on=['key1', 'key2']) + result = left.join(right, on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merging_join_multikeys.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); .. _merging.df_inner_join: @@ -561,10 +840,92 @@ easily performed: .. ipython:: python - data.join(to_join, on=['key1', 'key2'], how='inner') + result = left.join(right, on=['key1', 'key2'], how='inner') + +.. ipython:: python + :suppress: + + @savefig merging_join_multikeys_inner.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); As you can see, this drops any rows where there was no match. +.. _merging.join_on_mi: + +Joining a single Index to a Multi-index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +You can join a singly-indexed ``DataFrame`` with a level of a multi-indexed ``DataFrame``. +The level will match on the name of the index of the singly-indexed frame against +a level name of the multi-indexed frame. + +.. ipython:: python + + left = DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=Index(['K0', 'K1', 'K2'], name='key')) + + index = MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + right = DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index) + + result = left.join(right, how='inner') + +.. ipython:: python + :suppress: + + @savefig merging_join_multiindex_inner.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +This is equivalent but less verbose and more memory efficient / faster than this. + +.. ipython:: python + + result = merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key','Y']) + +.. 
ipython:: python + :suppress: + + @savefig merging_merge_multiindex_alternative.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +Joining with two multi-indexes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is not Implemented via ``join`` at-the-moment, however it can be done using the following. + +.. ipython:: python + + index = MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + left = DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index) + + result = merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key','X','Y']) + +.. ipython:: python + :suppress: + + @savefig merging_merge_two_multiindex.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -574,38 +935,47 @@ columns: .. ipython:: python - left = DataFrame({'key': ['foo', 'foo'], 'value': [1, 2]}) - right = DataFrame({'key': ['foo', 'foo'], 'value': [4, 5]}) - merge(left, right, on='key', suffixes=['_left', '_right']) + left = DataFrame({'k': ['K0', 'K1', 'K2'], 'v': [1, 2, 3]}) + right = DataFrame({'k': ['K0', 'K0', 'K3'], 'v': [4, 5, 6]}) -``DataFrame.join`` has ``lsuffix`` and ``rsuffix`` arguments which behave -similarly. + result = merge(left, right, on='k') -.. _merging.ordered_merge: +.. ipython:: python + :suppress: -Merging Ordered Data -~~~~~~~~~~~~~~~~~~~~ + @savefig merging_merge_overlapped.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); -New in v0.8.0 is the ordered_merge function for combining time series and other -ordered data. In particular it has an optional ``fill_method`` keyword to -fill/interpolate missing data: +.. ipython:: python + + result = merge(left, right, on='k', suffixes=['_l', '_r']) .. ipython:: python :suppress: - A = DataFrame({'key' : ['a', 'c', 'e'] * 2, - 'lvalue' : [1, 2, 3] * 2, - 'group' : ['a', 'a', 'a', 'b', 'b', 'b']}) - B = DataFrame({'key' : ['b', 'c', 'd'], - 'rvalue' : [1, 2, 3]}) + @savefig merging_merge_overlapped_suffix.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +``DataFrame.join`` has ``lsuffix`` and ``rsuffix`` arguments which behave +similarly. .. ipython:: python - A + left = left.set_index('k') + right = right.set_index('k') + result = left.join(right, lsuffix='_l', rsuffix='_r') - B +.. ipython:: python + :suppress: - ordered_merge(A, B, fill_method='ffill', left_by='group') + @savefig merging_merge_overlapped_multi_suffix.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); .. _merging.multiple_join: @@ -617,11 +987,44 @@ them together on their indexes. The same is true for ``Panel.join``. .. ipython:: python - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 'D']] - df3 = df.ix[:, ['key']] - df1 - df1.join([df2, df3]) + right2 = DataFrame({'v': [7, 8, 9]}, index=['K1', 'K1', 'K2']) + result = left.join([right, right2]) + +.. ipython:: python + :suppress: + + @savefig merging_join_multi_df.png + p.plot([left, right, right2], result, + labels=['left', 'right', 'right2'], vertical=False); + plt.close('all'); + +.. _merging.ordered_merge: + +Merging Ordered Data +~~~~~~~~~~~~~~~~~~~~ + +New in v0.8.0 is the ordered_merge function for combining time series and other +ordered data. 
In particular it has an optional ``fill_method`` keyword to +fill/interpolate missing data: + +.. ipython:: python + + left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], + 'lv': [1, 2, 3, 4], + 's': ['a', 'b', 'c', 'd']}) + + right = DataFrame({'k': ['K1', 'K2', 'K4'], + 'rv': [1, 2, 3]}) + + result = ordered_merge(left, right, fill_method='ffill', left_by='s') + +.. ipython:: python + :suppress: + + @savefig merging_ordered_merge.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=True); + plt.close('all'); .. _merging.combine_first.update: @@ -643,87 +1046,33 @@ For this, use the ``combine_first`` method: .. ipython:: python - df1.combine_first(df2) + result = df1.combine_first(df2) + +.. ipython:: python + :suppress: + + @savefig merging_combine_first.png + p.plot([df1, df2], result, + labels=['df1', 'df2'], vertical=False); + plt.close('all'); Note that this method only takes values from the right DataFrame if they are missing in the left DataFrame. A related method, ``update``, alters non-NA values inplace: .. ipython:: python + :suppress: - df1.update(df2) - df1 - -.. _merging.on_mi: - -Merging with Multi-indexes --------------------------- - -.. _merging.join_on_mi: - -Joining a single Index to a Multi-index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 0.14.0 - -You can join a singly-indexed DataFrame with a level of a multi-indexed DataFrame. -The level will match on the name of the index of the singly-indexed frame against -a level name of the multi-indexed frame. - -.. ipython:: python - - household = DataFrame(dict(household_id = [1,2,3], - male = [0,1,0], - wealth = [196087.3,316478.7,294750]), - columns = ['household_id','male','wealth'] - ).set_index('household_id') - household - portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4], - asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29", - "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], - name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell", - "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan], - share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), - columns = ['household_id','asset_id','name','share'] - ).set_index(['household_id','asset_id']) - portfolio - - household.join(portfolio, how='inner') - -This is equivalent but less verbose and more memory efficient / faster than this. - -.. code-block:: python - - merge(household.reset_index(), - portfolio.reset_index(), - on=['household_id'], - how='inner' - ).set_index(['household_id','asset_id']) - -Joining with two multi-indexes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is not Implemented via ``join`` at-the-moment, however it can be done using the following. + df1_copy = df1.copy() .. ipython:: python - household = DataFrame(dict(household_id = [1,2,2,3,3,3,4], - asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29", - "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], - share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), - columns = ['household_id','asset_id','share'] - ).set_index(['household_id','asset_id']) - household + df1.update(df2) - log_return = DataFrame(dict(asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237"], - t = [233, 234, 235, 180, 181], - log_return = [.09604978, -.06524096, .03532373, .03025441, .036997]), - ).set_index(["asset_id","t"]) - log_return +.. 
ipython:: python + :suppress: - merge(household.reset_index(), - log_return.reset_index(), - on=['asset_id'], - how='inner' - ).set_index(['household_id','asset_id','t']) + @savefig merging_update.png + p.plot([df1_copy, df2], df1, + labels=['df1', 'df2'], vertical=False); + plt.close('all'); diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 4505d256d31f6..04a6302f958a2 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -1,11 +1,19 @@ .. currentmodule:: pandas -.. _missing_data: .. ipython:: python :suppress: - from pandas import * - options.display.max_rows=15 + import numpy as np + import pandas as pd + pd.options.display.max_rows=15 + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + pd.options.display.mpl_style = 'default' + import matplotlib.pyplot as plt + +.. _missing_data: ************************* Working with missing data @@ -14,14 +22,6 @@ Working with missing data In this section, we will discuss missing (also referred to as NA) values in pandas. -.. ipython:: python - :suppress: - - import numpy as np; randn = np.random.randn; randint =np.random.randint - from pandas import * - import matplotlib.pyplot as plt - from pandas.compat import lrange - .. note:: The choice of using ``NaN`` internally to denote missing data was largely @@ -50,8 +50,8 @@ a data set is by reindexing. For example .. ipython:: python - df = DataFrame(randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], - columns=['one', 'two', 'three']) + df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], + columns=['one', 'two', 'three']) df['four'] = 'bar' df['five'] = df['one'] > 0 df @@ -118,7 +118,7 @@ the missing value type chosen: .. ipython:: python - s = Series([1, 2, 3]) + s = pd.Series([1, 2, 3]) s.loc[0] = None s @@ -128,7 +128,7 @@ For object containers, pandas will use the value given: .. ipython:: python - s = Series(["a", "b", "c"]) + s = pd.Series(["a", "b", "c"]) s.loc[0] = None s.loc[1] = np.nan s @@ -255,7 +255,7 @@ use case of this is to fill a DataFrame with the mean of that column. .. ipython:: python - dff = DataFrame(np.random.randn(10,3),columns=list('ABC')) + dff = pd.DataFrame(np.random.randn(10,3),columns=list('ABC')) dff.iloc[3:5,0] = np.nan dff.iloc[4:6,1] = np.nan dff.iloc[5:8,2] = np.nan @@ -307,7 +307,7 @@ Interpolation .. versionadded:: 0.13.0 :meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have - revamped interpolation methods and functionaility. + revamped interpolation methods and functionality. Both Series and Dataframe objects have an ``interpolate`` method that, by default, performs linear interpolation at missing datapoints. @@ -317,7 +317,7 @@ performs linear interpolation at missing datapoints. np.random.seed(123456) idx = date_range('1/1/2000', periods=100, freq='BM') - ts = Series(randn(100), index=idx) + ts = pd.Series(np.random.randn(100), index=idx) ts[1:20] = np.nan ts[60:80] = np.nan ts = ts.cumsum() @@ -328,7 +328,6 @@ performs linear interpolation at missing datapoints. ts.count() ts.interpolate().count() - plt.figure() @savefig series_interpolate.png ts.interpolate().plot() @@ -351,7 +350,7 @@ For a floating-point index, use ``method='values'``: :suppress: idx = [0., 1., 10.] - ser = Series([0., np.nan, 10.], idx) + ser = pd.Series([0., np.nan, 10.], idx) .. ipython:: python @@ -363,7 +362,7 @@ You can also interpolate with a DataFrame: .. 
ipython:: python - df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], + df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) df df.interpolate() @@ -401,13 +400,12 @@ Compare several methods: np.random.seed(2) - ser = Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37)) + ser = pd.Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37)) bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29]) ser[bad] = np.nan methods = ['linear', 'quadratic', 'cubic'] - df = DataFrame({m: ser.interpolate(method=m) for m in methods}) - plt.figure() + df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods}) @savefig compare_interpolations.png df.plot() @@ -419,7 +417,7 @@ at the new values. .. ipython:: python - ser = Series(np.sort(np.random.uniform(size=100))) + ser = pd.Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index new_index = ser.index | Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) @@ -438,7 +436,7 @@ observation: .. ipython:: python - ser = Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser = pd.Series([1, 3, np.nan, np.nan, np.nan, 11]) ser.interpolate(limit=2) .. _missing_data.replace: @@ -454,7 +452,7 @@ value: .. ipython:: python - ser = Series([0., 1., 2., 3., 4.]) + ser = pd.Series([0., 1., 2., 3., 4.]) ser.replace(0, 5) @@ -474,7 +472,7 @@ For a DataFrame, you can specify individual values by column: .. ipython:: python - df = DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) df.replace({'a': 0, 'b': 5}, 100) @@ -502,31 +500,24 @@ String/Regular Expression Replacement Replace the '.' with ``nan`` (str -> str) -.. ipython:: python - :suppress: - - from numpy.random import rand, randn - from numpy import nan - from pandas import DataFrame - .. ipython:: python - d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} - df = DataFrame(d) - df.replace('.', nan) + d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']} + df = pd.DataFrame(d) + df.replace('.', np.nan) Now do it with a regular expression that removes surrounding whitespace (regex -> regex) .. ipython:: python - df.replace(r'\s*\.\s*', nan, regex=True) + df.replace(r'\s*\.\s*', np.nan, regex=True) Replace a few different values (list -> list) .. ipython:: python - df.replace(['a', '.'], ['b', nan]) + df.replace(['a', '.'], ['b', np.nan]) list of regex -> list of regex @@ -538,14 +529,14 @@ Only search in column ``'b'`` (dict -> dict) .. ipython:: python - df.replace({'b': '.'}, {'b': nan}) + df.replace({'b': '.'}, {'b': np.nan}) Same as the previous example, but use a regular expression for searching instead (dict of regex -> dict) .. ipython:: python - df.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True) + df.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) You can pass nested dictionaries of regular expressions that use ``regex=True`` @@ -557,7 +548,7 @@ or you can pass the nested dictionary like so .. ipython:: python - df.replace(regex={'b': {r'\s*\.\s*': nan}}) + df.replace(regex={'b': {r'\s*\.\s*': np.nan}}) You can also use the group of a regular expression match when replacing (dict of regex -> dict of regex), this works for lists as well @@ -571,7 +562,7 @@ will be replaced with a scalar (list of regex -> regex) .. 
ipython:: python - df.replace([r'\s*\.\s*', r'a|b'], nan, regex=True) + df.replace([r'\s*\.\s*', r'a|b'], np.nan, regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -580,7 +571,7 @@ dictionary. The previous example, in this case, would then be .. ipython:: python - df.replace(regex=[r'\s*\.\s*', r'a|b'], value=nan) + df.replace(regex=[r'\s*\.\s*', r'a|b'], value=np.nan) This can be convenient if you do not want to pass ``regex=True`` every time you want to use a regular expression. @@ -595,33 +586,25 @@ Numeric Replacement Similar to ``DataFrame.fillna`` -.. ipython:: python - :suppress: - - from numpy.random import rand, randn - from numpy import nan - from pandas import DataFrame - from pandas.util.testing import assert_frame_equal - .. ipython:: python - df = DataFrame(randn(10, 2)) - df[rand(df.shape[0]) > 0.5] = 1.5 - df.replace(1.5, nan) + df = pd.DataFrame(np.random.randn(10, 2)) + df[np.random.rand(df.shape[0]) > 0.5] = 1.5 + df.replace(1.5, np.nan) Replacing more than one value via lists works as well .. ipython:: python df00 = df.values[0, 0] - df.replace([1.5, df00], [nan, 'a']) + df.replace([1.5, df00], [np.nan, 'a']) df[1].dtype You can also operate on the DataFrame in place .. ipython:: python - df.replace(1.5, nan, inplace=True) + df.replace(1.5, np.nan, inplace=True) .. warning:: @@ -631,7 +614,7 @@ You can also operate on the DataFrame in place .. code-block:: python - s = Series([True, False, True]) + s = pd.Series([True, False, True]) s.replace({'a string': 'new value', True: False}) # raises TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' @@ -643,7 +626,7 @@ You can also operate on the DataFrame in place .. ipython:: python - s = Series([True, False, True]) + s = pd.Series([True, False, True]) s.replace('a string', 'another string') the original ``NDFrame`` object will be returned untouched. We're working on @@ -672,7 +655,7 @@ For example: .. ipython:: python - s = Series(randn(5), index=[0, 2, 4, 6, 7]) + s = pd.Series(np.random.randn(5), index=[0, 2, 4, 6, 7]) s > 0 (s > 0).dtype crit = (s > 0).reindex(list(range(8))) diff --git a/doc/source/options.rst b/doc/source/options.rst index 7e36f369bc7e7..4b69015353612 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -18,7 +18,7 @@ Overview pandas has an options system that lets you customize some aspects of its behaviour, display-related options being those the user is most likely to adjust. -Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``), +Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). You can get/set options directly as attributes of the top-level ``options`` attribute: .. ipython:: python @@ -29,7 +29,7 @@ You can get/set options directly as attributes of the top-level ``options`` attr pd.options.display.max_rows There is also an API composed of 5 relevant functions, available directly from the ``pandas`` -namespace, and they are: +namespace: - :func:`~pandas.get_option` / :func:`~pandas.set_option` - get/set the value of a single option. - :func:`~pandas.reset_option` - reset one or more options to their default value. @@ -412,7 +412,7 @@ mode.use_inf_as_null False True means treat None, NaN, -INF, Number Formatting ------------------ -pandas also allow you to set how numbers are displayed in the console. +pandas also allows you to set how numbers are displayed in the console. 
This option is not set through the ``set_options`` API. Use the ``set_eng_float_format`` function diff --git a/doc/source/overview.rst b/doc/source/overview.rst index 49a788def2854..b1addddc2121d 100644 --- a/doc/source/overview.rst +++ b/doc/source/overview.rst @@ -9,7 +9,7 @@ Package overview :mod:`pandas` consists of the following things * A set of labeled array data structures, the primary of which are - Series/TimeSeries and DataFrame + Series and DataFrame * Index objects enabling both simple axis indexing and multi-level / hierarchical axis indexing * An integrated group by engine for aggregating and transforming data sets @@ -32,7 +32,6 @@ Data structures at a glance :widths: 15, 20, 50 1, Series, "1D labeled homogeneously-typed array" - 1, TimeSeries, "Series with index containing datetimes" 2, DataFrame, "General 2D labeled, size-mutable tabular structure with potentially heterogeneously-typed columns" 3, Panel, "General 3D labeled, also size-mutable array" diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index 826d9e980538e..da37c92c88ecf 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -15,7 +15,69 @@ rpy2 / R interface .. warning:: - In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be removed in a future version**. Similar functionaility can be accessed thru the `rpy2 `_ project. + In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be + removed in a future version**. Similar functionality can be accessed + through the `rpy2 `_ project. + See the :ref:`updating ` section for a guide to port your + code from the ``pandas.rpy`` to ``rpy2`` functions. + + +.. _rpy.updating: + +Updating your code to use rpy2 functions +---------------------------------------- + +In v0.16.0, the ``pandas.rpy`` module has been **deprecated** and users are +pointed to the similar functionality in ``rpy2`` itself (rpy2 >= 2.4). + +Instead of importing ``import pandas.rpy.common as com``, the following imports +should be done to activate the pandas conversion support in rpy2:: + + from rpy2.robjects import pandas2ri + pandas2ri.activate() + +Converting data frames back and forth between rpy2 and pandas should be largely +automated (no need to convert explicitly, it will be done on the fly in most +rpy2 functions). + +To convert explicitly, the functions are ``pandas2ri.py2ri()`` and +``pandas2ri.ri2py()``. So these functions can be used to replace the existing +functions in pandas: + +- ``com.convert_to_r_dataframe(df)`` should be replaced with ``pandas2ri.py2ri(df)`` +- ``com.convert_robj(rdf)`` should be replaced with ``pandas2ri.ri2py(rdf)`` + +Note: these functions are for the latest version (rpy2 2.5.x) and were called +``pandas2ri.pandas2ri()`` and ``pandas2ri.ri2pandas()`` previously. + +Some of the other functionality in `pandas.rpy` can be replaced easily as well. +For example to load R data as done with the ``load_data`` function, the +current method:: + + df_iris = com.load_data('iris') + +can be replaced with:: + + from rpy2.robjects import r + r.data('iris') + df_iris = pandas2ri.ri2py(r[name]) + +The ``convert_to_r_matrix`` function can be replaced by the normal +``pandas2ri.py2ri`` to convert dataframes, with a subsequent call to R +``as.matrix`` function. + +.. warning:: + + Not all conversion functions in rpy2 are working exactly the same as the + current methods in pandas. 
If you experience problems or limitations in + comparison to the ones in pandas, please report this at the + `issue tracker `_. + +See also the documentation of the `rpy2 `_ project. + + +R interface with rpy2 +--------------------- If your computer has R and rpy2 (> 2.2) installed (which will be left to the reader), you will be able to leverage the below functionality. On Windows, @@ -56,6 +118,7 @@ appropriate pandas object (most likely a DataFrame): .. ipython:: python + :okwarning: import pandas.rpy.common as com infert = com.load_data('infert') diff --git a/doc/source/release.rst b/doc/source/release.rst index 074e686ac1662..f22f95fd0a7d4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -45,6 +45,80 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +pandas 0.16.1 +------------- + +**Release date:** (May 11, 2015) + +This is a minor release from 0.16.0 and includes a large number of bug fixes +along with several new features, enhancements, and performance improvements. +A small number of API changes were necessary to fix existing bugs. + +See the :ref:`v0.16.1 Whatsnew ` overview for an extensive list +of all API changes, enhancements and bugs that have been fixed in 0.16.1. + +Thanks +~~~~~~ + +- Alfonso MHC +- Andy Hayden +- Artemy Kolchinsky +- Chris Gilmer +- Chris Grinolds +- Dan Birken +- David BROCHART +- David Hirschfeld +- David Stephens +- Dr. Leo +- Evan Wright +- Frans van Dunné +- Hatem Nassrat +- Henning Sperr +- Hugo Herter +- Jan Schulz +- Jeff Blackburne +- Jeff Reback +- Jim Crist +- Jonas Abernot +- Joris Van den Bossche +- Kerby Shedden +- Leo Razoumov +- Manuel Riel +- Mortada Mehyar +- Nick Burns +- Nick Eubank +- Olivier Grisel +- Phillip Cloud +- Pietro Battiston +- Roy Hyunjin Han +- Sam Zhang +- Scott Sanderson +- Stephan Hoyer +- Tiago Antao +- Tom Ajamian +- Tom Augspurger +- Tomaz Berisa +- Vikram Shirgur +- Vladimir Filimonov +- William Hogman +- Yasin A +- Younggun Kim +- behzad nouri +- dsm054 +- floydsoft +- flying-sheep +- gfr +- jnmclarty +- jreback +- ksanghai +- lucas +- mschmohl +- ptype +- rockg +- scls19fr +- sinhrks + + pandas 0.16.0 ------------- diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index ac9b6c9aecc4a..1992288fd4d00 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -25,6 +25,24 @@ Remote Data Access ****************** +.. _remote_data.pandas_datareader: + +.. warning:: + + In pandas 0.17.0, the sub-package ``pandas.io.data`` will be removed in favor of a separately installable `pandas-datareader package `_. This will allow the data modules to be independently updated to your pandas installation. The API for ``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. (:issue:`8961`) + + You should replace the imports of the following: + + .. code-block:: python + + from pandas.io import data, wb + + With: + + .. code-block:: python + + from pandas_datareader import data, wb + .. _remote_data.data_reader: Functions from :mod:`pandas.io.data` and :mod:`pandas.io.ga` extract data from various Internet sources into a DataFrame. Currently the following sources are supported: @@ -49,7 +67,7 @@ Yahoo! Finance import datetime start = datetime.datetime(2010, 1, 1) end = datetime.datetime(2013, 1, 27) - f=web.DataReader("F", 'yahoo', start, end) + f = web.DataReader("F", 'yahoo', start, end) f.ix['2010-01-04'] .. 
_remote_data.yahoo_options: @@ -58,10 +76,10 @@ Yahoo! Finance Options ---------------------- ***Experimental*** -The Options class allows the download of options data from Yahoo! Finance. +The ``Options`` class allows the download of options data from Yahoo! Finance. The ``get_all_data`` method downloads and caches option data for all expiry months -and provides a formatted ``DataFrame`` with a hierarchical index, so its easy to get +and provides a formatted ``DataFrame`` with a hierarchical index, so it is easy to get to the specific option you want. .. ipython:: python @@ -71,10 +89,10 @@ to the specific option you want. data = aapl.get_all_data() data.iloc[0:5, 0:5] - #Show the $100 strike puts at all expiry dates: + # Show the $100 strike puts at all expiry dates: data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5] - #Show the volume traded of $100 strike puts at all expiry dates: + # Show the volume traded of $100 strike puts at all expiry dates: data.loc[(100, slice(None), 'put'),'Vol'].head() If you don't want to download all the data, more specific requests can be made. @@ -121,7 +139,7 @@ Google Finance import datetime start = datetime.datetime(2010, 1, 1) end = datetime.datetime(2013, 1, 27) - f=web.DataReader("F", 'google', start, end) + f = web.DataReader("F", 'google', start, end) f.ix['2010-01-04'] .. _remote_data.fred: @@ -152,7 +170,7 @@ Dataset names are listed at `Fama/French Data Library .. ipython:: python import pandas.io.data as web - ip=web.DataReader("5_Industry_Portfolios", "famafrench") + ip = web.DataReader("5_Industry_Portfolios", "famafrench") ip[4].ix[192607] .. _remote_data.wb: @@ -168,7 +186,7 @@ Indicators ~~~~~~~~~~ Either from exploring the World Bank site, or using the search function included, -every world bank indicator is accessible. +every world bank indicator is accessible. For example, if you wanted to compare the Gross Domestic Products per capita in constant dollars in North America, you would use the ``search`` function: @@ -287,7 +305,7 @@ Country Codes .. versionadded:: 0.15.1 -The ``country`` argument accepts a string or list of mixed +The ``country`` argument accepts a string or list of mixed `two `__ or `three `__ character ISO country codes, as well as dynamic `World Bank exceptions `__ to the ISO standards. @@ -298,13 +316,12 @@ Problematic Country Codes & Indicators .. note:: - The World Bank's country list and indicators are dynamic. As of 0.15.1, + The World Bank's country list and indicators are dynamic. As of 0.15.1, :func:`wb.download()` is more flexible. To achieve this, the warning and exception logic changed. - -The world bank converts some country codes, -in their response, which makes error checking by pandas difficult. -Retired indicators still persist in the search. + +The world bank converts some country codes in their response, which makes error +checking by pandas difficult. Retired indicators still persist in the search. Given the new flexibility of 0.15.1, improved error handling by the user may be necessary for fringe cases. @@ -321,12 +338,12 @@ There are at least 4 kinds of country codes: There are at least 3 kinds of indicators: 1. Current - Returns data. -2. Retired - Appears in search results, yet won't return data. +2. Retired - Appears in search results, yet won't return data. 3. Bad - Will not return data. Use the ``errors`` argument to control warnings and exceptions. Setting errors to ignore or warn, won't stop failed responses. (ie, 100% bad -indicators, or a single "bad" (#4 above) country code). 
+indicators, or a single "bad" (#4 above) country code). See docstrings for more info. @@ -377,15 +394,14 @@ The following will fetch users and pageviews (metrics) data per day of the week, filters = "pagePath=~aboutus;ga:country==France", ) -The only mandatory arguments are ``metrics,`` ``dimensions`` and ``start_date``. We can only strongly recommend you to always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics. +The only mandatory arguments are ``metrics,`` ``dimensions`` and ``start_date``. We strongly recommend that you always specify the ``account_id``, ``profile_id`` and ``property_id`` to avoid accessing the wrong data bucket in Google Analytics. The ``index_col`` argument indicates which dimension(s) has to be taken as index. -The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page has URL has to contain ``aboutus`` AND the visitors country has to be France. +The ``filters`` argument indicates the filtering to apply to the query. In the above example, the page URL has to contain ``aboutus`` AND the visitors country has to be France. -Detailed informations in the followings: +Detailed information in the following: * `pandas & google analytics, by yhat `__ * `Google Analytics integration in pandas, by Chang She `__ * `Google Analytics Dimensions and Metrics Reference `_ - diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index dc13ce3e5c4da..26aaf9c2be69d 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -6,14 +6,9 @@ import numpy as np np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - from pandas.core.reshape import * - import pandas.util.testing as tm - randn = np.random.randn + import pandas as pd + pd.options.display.max_rows=15 np.set_printoptions(precision=4, suppress=True) - from pandas.tools.tile import * - from pandas.compat import zip ************************** Reshaping and Pivot Tables @@ -56,7 +51,7 @@ For the curious here is how the above DataFrame was created: data = {'value' : frame.values.ravel('F'), 'variable' : np.asarray(frame.columns).repeat(N), 'date' : np.tile(np.asarray(frame.index), K)} - return DataFrame(data, columns=['date', 'variable', 'value']) + return pd.DataFrame(data, columns=['date', 'variable', 'value']) df = unpivot(tm.makeTimeDataFrame()) To select out everything for variable ``A`` we could do: @@ -119,11 +114,11 @@ from the hierarchical indexing section: .. ipython:: python tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', - 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', - 'one', 'two', 'one', 'two']])) - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame(randn(8, 2), index=index, columns=['A', 'B']) + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']])) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) df2 = df[:4] df2 @@ -166,8 +161,8 @@ will result in a **sorted** copy of the original DataFrame or Series: .. ipython:: python - index = MultiIndex.from_product([[2,1], ['a', 'b']]) - df = DataFrame(randn(4), index=index, columns=['A']) + index = pd.MultiIndex.from_product([[2,1], ['a', 'b']]) + df = pd.DataFrame(np.random.randn(4), index=index, columns=['A']) df all(df.unstack().stack() == df.sort()) @@ -185,13 +180,13 @@ processed individually. .. 
ipython:: python - columns = MultiIndex.from_tuples([ + columns = pd.MultiIndex.from_tuples([ ('A', 'cat', 'long'), ('B', 'cat', 'long'), ('A', 'dog', 'short'), ('B', 'dog', 'short') ], names=['exp', 'animal', 'hair_length'] ) - df = DataFrame(randn(4, 4), columns=columns) + df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df df.stack(level=['animal', 'hair_length']) @@ -215,12 +210,13 @@ calling ``sortlevel``, of course). Here is a more complex example: .. ipython:: python - columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), - ('B', 'cat'), ('A', 'dog')], - names=['exp', 'animal']) - index = MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'), ('one', 'two')], - names=['first', 'second']) - df = DataFrame(randn(8, 4), index=index, columns=columns) + columns = pd.MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), + ('B', 'cat'), ('A', 'dog')], + names=['exp', 'animal']) + index = pd.MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'), + ('one', 'two')], + names=['first', 'second']) + df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns) df2 = df.ix[[0, 1, 2, 4, 5, 7]] df2 @@ -259,13 +255,13 @@ For instance, .. ipython:: python - cheese = DataFrame({'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + cheese = pd.DataFrame({'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) cheese - melt(cheese, id_vars=['first', 'last']) - melt(cheese, id_vars=['first', 'last'], var_name='quantity') + pd.melt(cheese, id_vars=['first', 'last']) + pd.melt(cheese, id_vars=['first', 'last'], var_name='quantity') Another way to transform is to use the ``wide_to_long`` panel data convenience function. @@ -324,22 +320,22 @@ Consider a data set like this: .. ipython:: python import datetime - df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6, - 'B' : ['A', 'B', 'C'] * 8, - 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, - 'D' : np.random.randn(24), - 'E' : np.random.randn(24), - 'F' : [datetime.datetime(2013, i, 1) for i in range(1, 13)] + - [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) + df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, + 'B': ['A', 'B', 'C'] * 8, + 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, + 'D': np.random.randn(24), + 'E': np.random.randn(24), + 'F': [datetime.datetime(2013, i, 1) for i in range(1, 13)] + + [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) df We can produce pivot tables from this data very easily: .. ipython:: python - pivot_table(df, values='D', index=['A', 'B'], columns=['C']) - pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) - pivot_table(df, values=['D','E'], index=['B'], columns=['A', 'C'], aggfunc=np.sum) + pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) + pd.pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) + pd.pivot_table(df, values=['D','E'], index=['B'], columns=['A', 'C'], aggfunc=np.sum) The result object is a DataFrame having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -348,20 +344,20 @@ hierarchy in the columns: .. ipython:: python - pivot_table(df, index=['A', 'B'], columns=['C']) + pd.pivot_table(df, index=['A', 'B'], columns=['C']) Also, you can use ``Grouper`` for ``index`` and ``columns`` keywords. For detail of ``Grouper``, see :ref:`Grouping with a Grouper specification `. .. 
ipython:: python - pivot_table(df, values='D', index=Grouper(freq='M', key='F'), columns='C') + pd.pivot_table(df, values='D', index=Grouper(freq='M', key='F'), columns='C') You can render a nice output of the table omitting the missing values by calling ``to_string`` if you wish: .. ipython:: python - table = pivot_table(df, index=['A', 'B'], columns=['C']) + table = pd.pivot_table(df, index=['A', 'B'], columns=['C']) print(table.to_string(na_rep='')) Note that ``pivot_table`` is also available as an instance method on DataFrame. @@ -397,7 +393,7 @@ For example: a = np.array([foo, foo, bar, bar, foo, foo], dtype=object) b = np.array([one, one, two, one, two, one], dtype=object) c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) - crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) .. _reshaping.pivot.margins: @@ -428,14 +424,14 @@ variables: ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]) - cut(ages, bins=3) + pd.cut(ages, bins=3) If the ``bins`` keyword is an integer, then equal-width bins are formed. Alternatively we can specify custom bin-edges: .. ipython:: python - cut(ages, bins=[0, 18, 35, 70]) + pd.cut(ages, bins=[0, 18, 35, 70]) .. _reshaping.dummies: @@ -449,17 +445,16 @@ containing ``k`` columns of 1s and 0s: .. ipython:: python - df = DataFrame({'key': list('bbacab'), 'data1': range(6)}) + df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)}) - - get_dummies(df['key']) + pd.get_dummies(df['key']) Sometimes it's useful to prefix the column names, for example when merging the result with the original DataFrame: .. ipython:: python - dummies = get_dummies(df['key'], prefix='key') + dummies = pd.get_dummies(df['key'], prefix='key') dummies @@ -469,14 +464,14 @@ This function is often used along with discretization functions like ``cut``: .. ipython:: python - values = randn(10) + values = np.random.randn(10) values bins = [0, 0.2, 0.4, 0.6, 0.8, 1] - get_dummies(cut(values, bins)) + pd.get_dummies(pd.cut(values, bins)) See also :func:`Series.str.get_dummies `. diff --git a/doc/source/text.rst b/doc/source/text.rst index a98153e277fae..d40445d8490f7 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -17,10 +17,10 @@ Working with Text Data .. _text.string_methods: -Series is equipped with a set of string processing methods +Series and Index are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are -accessed via the Series's ``str`` attribute and generally have names matching +accessed via the ``str`` attribute and generally have names matching the equivalent (scalar) built-in string methods: .. ipython:: python @@ -30,6 +30,39 @@ the equivalent (scalar) built-in string methods: s.str.upper() s.str.len() +.. ipython:: python + + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + idx.str.lstrip() + idx.str.rstrip() + +The string methods on Index are especially useful for cleaning up or +transforming DataFrame columns. For instance, you may have columns with +leading or trailing whitespace: + +.. ipython:: python + + df = DataFrame(randn(3, 2), columns=[' Column A ', ' Column B '], + index=range(3)) + df + +Since ``df.columns`` is an Index object, we can use the ``.str`` accessor + +.. 
ipython:: python + + df.columns.str.strip() + df.columns.str.lower() + +These string methods can then be used to clean up the columns as needed. +Here we are removing leading and trailing whitespaces, lowercasing all names, +and replacing any remaining whitespaces with underscores: + +.. ipython:: python + + df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') + df + Splitting and Replacing Strings ------------------------------- @@ -49,11 +82,11 @@ Elements in the split lists can be accessed using ``get`` or ``[]`` notation: s2.str.split('_').str.get(1) s2.str.split('_').str[1] -Easy to expand this to return a DataFrame using ``return_type``. +Easy to expand this to return a DataFrame using ``expand``. .. ipython:: python - s2.str.split('_', return_type='frame') + s2.str.split('_', expand=True) Methods like ``replace`` and ``findall`` take `regular expressions `__, too: @@ -229,12 +262,18 @@ Method Summary :meth:`~Series.str.strip`,Equivalent to ``str.strip`` :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip`` :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~Series.str.partition`,Equivalent to ``str.partition`` + :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition`` :meth:`~Series.str.lower`,Equivalent to ``str.lower`` :meth:`~Series.str.upper`,Equivalent to ``str.upper`` :meth:`~Series.str.find`,Equivalent to ``str.find`` :meth:`~Series.str.rfind`,Equivalent to ``str.rfind`` - :meth:`~Series.str.capicalize`,Equivalent to ``str.capitalize`` + :meth:`~Series.str.index`,Equivalent to ``str.index`` + :meth:`~Series.str.rindex`,Equivalent to ``str.rindex`` + :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` + :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize`` + :meth:`~Series.str.translate`,Equivalent to ``str.translate`` :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit`` @@ -243,4 +282,4 @@ Method Summary :meth:`~Series.str.isupper`,Equivalent to ``str.isupper`` :meth:`~Series.str.istitle`,Equivalent to ``str.istitle`` :meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric`` - :meth:`~Series.str.isnumeric`,Equivalent to ``str.isdecimal`` + :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal`` diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 786a46d343be1..8215414e425fe 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -29,13 +29,13 @@ Time Deltas Starting in v0.15.0, we introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner, but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes. -Timedeltas are differences in times, expressed in difference units, e.g. days,hours,minutes,seconds. +Timedeltas are differences in times, expressed in difference units, e.g. days, hours, minutes, seconds. They can be both positive and negative. Parsing ------- -You can construct a ``Timedelta`` scalar thru various arguments: +You can construct a ``Timedelta`` scalar through various arguments: .. 
ipython:: python @@ -46,7 +46,7 @@ You can construct a ``Timedelta`` scalar thru various arguments: Timedelta('-1 days 2 min 3us') # like datetime.timedelta - # note: these MUST be specified as keyword argments + # note: these MUST be specified as keyword arguments Timedelta(days=1,seconds=1) # integers with a unit @@ -100,7 +100,7 @@ It will construct Series if the input is a Series, a scalar if the input is scal Operations ---------- -You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series thru +You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series through subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. .. ipython:: python @@ -290,7 +290,7 @@ TimedeltaIndex .. versionadded:: 0.15.0 -To generate an index with time delta, you can use either the TimedeltaIndex or +To generate an index with time delta, you can use either the ``TimedeltaIndex`` or the ``timedelta_range`` constructor. Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``, diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index ac3302ae40fa7..ce1035e91391a 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -4,7 +4,7 @@ .. ipython:: python :suppress: - from datetime import datetime, timedelta + from datetime import datetime, timedelta, time import numpy as np np.random.seed(123456) from pandas import * @@ -243,7 +243,7 @@ variety of frequency aliases. The default frequency for ``date_range`` is a rng = bdate_range(start, end) rng -``date_range`` and ``bdate_range`` makes it easy to generate a range of dates +``date_range`` and ``bdate_range`` make it easy to generate a range of dates using various combinations of parameters like ``start``, ``end``, ``periods``, and ``freq``: @@ -353,7 +353,7 @@ This specifies an **exact** stop time (and is not the same as the above) dft['2013-1':'2013-2-28 00:00:00'] -We are stopping on the included end-point as its part of the index +We are stopping on the included end-point as it is part of the index .. ipython:: python @@ -482,6 +482,7 @@ frequency increment. Specific offset logic like "month", "business day", or BYearEnd, "business year end" BYearBegin, "business year begin" FY5253, "retail (aka 52-53 week) year" + BusinessHour, "business hour" Hour, "one hour" Minute, "one minute" Second, "one second" @@ -540,7 +541,7 @@ The ``rollforward`` and ``rollback`` methods do exactly what you would expect: It's definitely worth exploring the ``pandas.tseries.offsets`` module and the various docstrings for the classes. -These operations (``apply``, ``rollforward`` and ``rollback``) preserves time (hour, minute, etc) information by default. To reset time, use ``normalize=True`` keyword when create offset instance. If ``normalize=True``, result is normalized after the function is applied. +These operations (``apply``, ``rollforward`` and ``rollback``) preserves time (hour, minute, etc) information by default. To reset time, use ``normalize=True`` keyword when creating the offset instance. If ``normalize=True``, result is normalized after the function is applied. .. ipython:: python @@ -563,7 +564,7 @@ Parametric offsets ~~~~~~~~~~~~~~~~~~ Some of the offsets can be "parameterized" when created to result in different -behavior. For example, the ``Week`` offset for generating weekly data accepts a +behaviors. 
For example, the ``Week`` offset for generating weekly data accepts a ``weekday`` parameter which results in the generated dates always lying on a particular day of the week: @@ -667,6 +668,102 @@ in the usual way. have to change to fix the timezone issues, the behaviour of the ``CustomBusinessDay`` class may have to change in future versions. +.. _timeseries.businesshour: + +Business Hour +~~~~~~~~~~~~~ + +The ``BusinessHour`` class provides a business hour representation on ``BusinessDay``, +allowing you to use specific start and end times. + +By default, ``BusinessHour`` uses 9:00 - 17:00 as business hours. +Adding ``BusinessHour`` will increment ``Timestamp`` by the hour. +If the target ``Timestamp`` is out of business hours, it is first moved to the next business hour and then incremented. +If the result exceeds the end of the business hours, the remaining time is added to the next business day. + +.. ipython:: python + + bh = BusinessHour() + bh + + # 2014-08-01 is Friday + Timestamp('2014-08-01 10:00').weekday() + Timestamp('2014-08-01 10:00') + bh + + # The example below is the same as Timestamp('2014-08-01 09:00') + bh + Timestamp('2014-08-01 08:00') + bh + + # If the result is on the end time, move to the next business day + Timestamp('2014-08-01 16:00') + bh + + # The remaining time is added to the next business day + Timestamp('2014-08-01 16:30') + bh + + # Adding 2 business hours + Timestamp('2014-08-01 10:00') + BusinessHour(2) + + # Subtracting 3 business hours + Timestamp('2014-08-01 10:00') + BusinessHour(-3) + +You can also specify the ``start`` and ``end`` times by keyword. +The argument must be a ``str`` with an ``hour:minute`` representation, or a ``datetime.time`` instance. +Specifying seconds, microseconds or nanoseconds as business hours results in a ``ValueError``. + +.. ipython:: python + + bh = BusinessHour(start='11:00', end=time(20, 0)) + bh + + Timestamp('2014-08-01 13:00') + bh + Timestamp('2014-08-01 09:00') + bh + Timestamp('2014-08-01 18:00') + bh + +Passing a ``start`` time later than ``end`` represents a midnight business hour. +In this case, the business hours extend past midnight and overlap into the next day. +Valid business hours are distinguished by whether they started from a valid ``BusinessDay``. + +.. ipython:: python + + bh = BusinessHour(start='17:00', end='09:00') + bh + + Timestamp('2014-08-01 17:00') + bh + Timestamp('2014-08-01 23:00') + bh + + # Although 2014-08-02 is Saturday, + # it is valid because it starts from 08-01 (Friday). + Timestamp('2014-08-02 04:00') + bh + + # Although 2014-08-04 is Monday, + # it is out of business hours because it starts from 08-03 (Sunday). + Timestamp('2014-08-04 04:00') + bh + +Applying ``BusinessHour.rollforward`` and ``rollback`` to a timestamp outside business hours results in +the next business hour start or the previous day's end. Unlike other offsets, ``BusinessHour.rollforward`` +may, by definition, output a different result from ``apply``. + +This is because one day's business hour end is equal to the next day's business hour start. For example, +under the default business hours (9:00 - 17:00), there is no gap (0 minutes) between ``2014-08-01 17:00`` and +``2014-08-04 09:00``. + +.. ipython:: python + + # This adjusts a Timestamp to the business hour edge + BusinessHour().rollback(Timestamp('2014-08-02 15:00')) + BusinessHour().rollforward(Timestamp('2014-08-02 15:00')) + + # It is the same as BusinessHour().apply(Timestamp('2014-08-01 17:00')).
+ # And it is the same as BusinessHour().apply(Timestamp('2014-08-04 09:00')) + BusinessHour().apply(Timestamp('2014-08-02 15:00')) + + # BusinessDay results (for reference) + BusinessHour().rollforward(Timestamp('2014-08-02')) + + # It is the same as BusinessDay().apply(Timestamp('2014-08-01')) + # The result is the same as rollworward because BusinessDay never overlap. + BusinessHour().apply(Timestamp('2014-08-02')) + + Offset Aliases ~~~~~~~~~~~~~~ @@ -696,6 +793,7 @@ frequencies. We will refer to these aliases as *offset aliases* "BA", "business year end frequency" "AS", "year start frequency" "BAS", "business year start frequency" + "BH", "business hour frequency" "H", "hourly frequency" "T", "minutely frequency" "S", "secondly frequency" @@ -806,7 +904,7 @@ strongly recommended that you switch to using the new offset aliases. "ms", "L" "us", "U" -As you can see, legacy quarterly and annual frequencies are business quarter +As you can see, legacy quarterly and annual frequencies are business quarters and business year ends. Please also note the legacy time rule for milliseconds ``ms`` versus the new offset alias for month start ``MS``. This means that offset alias parsing is case sensitive. @@ -910,10 +1008,9 @@ Time series-related instance methods Shifting / lagging ~~~~~~~~~~~~~~~~~~ -One may want to *shift* or *lag* the values in a TimeSeries back and forward in +One may want to *shift* or *lag* the values in a time series back and forward in time. The method for this is ``shift``, which is available on all of the pandas -objects. In DataFrame, ``shift`` will currently only shift along the ``index`` -and in Panel along the ``major_axis``. +objects. .. ipython:: python @@ -929,7 +1026,7 @@ The shift method accepts an ``freq`` argument which can accept a ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and -``TimeSeries`` objects also have a ``tshift`` convenience method that changes +``Series`` objects also have a ``tshift`` convenience method that changes all the dates in the index by a specified number of offsets: .. ipython:: python @@ -1060,8 +1157,8 @@ frequency periods. Note that 0.8 marks a watershed in the timeseries functionality in pandas. In previous versions, resampling had to be done using a combination of ``date_range``, ``groupby`` with ``asof``, and then calling an aggregation -function on the grouped object. This was not nearly convenient or performant as -the new pandas timeseries API. +function on the grouped object. This was not nearly as convenient or performant +as the new pandas timeseries API. .. _timeseries.periods: @@ -1099,7 +1196,7 @@ frequency. p - 3 -If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have same freq. Otherise, ``ValueError`` will be raised. +If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherise, ``ValueError`` will be raised. .. ipython:: python @@ -1160,7 +1257,7 @@ objects: ps = Series(randn(len(prng)), prng) ps -``PeriodIndex`` supports addition and subtraction as the same rule as ``Period``. +``PeriodIndex`` supports addition and subtraction with the same rule as ``Period``. .. 
ipython:: python @@ -1175,7 +1272,7 @@ objects: PeriodIndex Partial String Indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can pass in dates and strings to `Series` and `DataFrame` with `PeriodIndex`, as the same manner as `DatetimeIndex`. For details, refer to :ref:`DatetimeIndex Partial String Indexing `. +You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `. .. ipython:: python @@ -1185,7 +1282,7 @@ You can pass in dates and strings to `Series` and `DataFrame` with `PeriodIndex` ps['10/31/2011':'12/31/2011'] -Passing string represents lower frequency than `PeriodIndex` returns partial sliced data. +Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. ipython:: python @@ -1196,7 +1293,7 @@ Passing string represents lower frequency than `PeriodIndex` returns partial sli dfp dfp['2013-01-01 10H'] -As the same as `DatetimeIndex`, the endpoints will be included in the result. Below example slices data starting from 10:00 to 11:59. +As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. .. ipython:: python @@ -1204,7 +1301,7 @@ As the same as `DatetimeIndex`, the endpoints will be included in the result. Be Frequency Conversion and Resampling with PeriodIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The frequency of Periods and PeriodIndex can be converted via the ``asfreq`` +The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq`` method. Let's start with the fiscal year 2011, ending in December: .. ipython:: python @@ -1247,8 +1344,8 @@ period. Period conversions with anchored frequencies are particularly useful for working with various quarterly data common to economics, business, and other fields. Many organizations define quarters relative to the month in which their -fiscal year start and ends. Thus, first quarter of 2011 could start in 2010 or -a few months into 2011. Via anchored frequencies, pandas works all quarterly +fiscal year starts and ends. Thus, first quarter of 2011 could start in 2010 or +a few months into 2011. Via anchored frequencies, pandas works for all quarterly frequencies ``Q-JAN`` through ``Q-DEC``. ``Q-DEC`` define regular calendar quarters: @@ -1354,7 +1451,7 @@ Time Zone Handling ------------------ Pandas provides rich support for working with timestamps in different time zones using ``pytz`` and ``dateutil`` libraries. -``dateutil`` support is new [in 0.14.1] and currently only supported for fixed offset and tzfile zones. The default library is ``pytz``. +``dateutil`` support is new in 0.14.1 and currently only supported for fixed offset and tzfile zones. The default library is ``pytz``. Support for ``dateutil`` is provided for compatibility with other applications e.g. if you use ``dateutil`` in other python packages. Working with Time Zones @@ -1472,7 +1569,7 @@ time zones using ``tz_convert``: rng_berlin[5] rng_eastern[5].tz_convert('Europe/Berlin') -Localization of Timestamps functions just like DatetimeIndex and TimeSeries: +Localization of Timestamps functions just like DatetimeIndex and Series: .. 
ipython:: python @@ -1480,8 +1577,8 @@ Localization of Timestamps functions just like DatetimeIndex and TimeSeries: rng[5].tz_localize('Asia/Shanghai') -Operations between TimeSeries in different time zones will yield UTC -TimeSeries, aligning the data on the UTC timestamps: +Operations between Series in different time zones will yield UTC +Series, aligning the data on the UTC timestamps: .. ipython:: python diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 9d4cba2e5ee8c..51912b5d6b106 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -6,20 +6,16 @@ import numpy as np import pandas as pd - from numpy.random import randn, rand, randint np.random.seed(123456) - from pandas import DataFrame, Series, date_range, options - import pandas.util.testing as tm np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - plt.close('all') + pd.options.display.max_rows = 15 import matplotlib try: matplotlib.style.use('ggplot') except AttributeError: - options.display.mpl_style = 'default' - options.display.max_rows = 15 - from pandas.compat import lrange + pd.options.display.mpl_style = 'default' + import matplotlib.pyplot as plt + plt.close('all') ******** Plotting @@ -68,7 +64,7 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. ipython:: python - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -87,7 +83,7 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() @savefig frame_plot_basic.png @@ -105,8 +101,8 @@ You can plot one column versus another using the `x` and `y` keywords in .. ipython:: python - df3 = DataFrame(randn(1000, 2), columns=['B', 'C']).cumsum() - df3['A'] = Series(list(range(len(df)))) + df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() + df3['A'] = pd.Series(list(range(len(df)))) @savefig df_plot_xy.png df3.plot(x='A', y='B') @@ -182,7 +178,7 @@ bar plot: .. ipython:: python - df2 = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) @savefig bar_plot_multi_ex.png df2.plot(kind='bar'); @@ -224,8 +220,8 @@ Histogram can be drawn specifying ``kind='hist'``. .. ipython:: python - df4 = DataFrame({'a': randn(1000) + 1, 'b': randn(1000), - 'c': randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), + 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) plt.figure(); @@ -267,7 +263,7 @@ You can pass other keywords supported by matplotlib ``hist``. For example, horiz plt.close('all') See the :meth:`hist ` method and the -`matplotlib hist documenation `__ for more. +`matplotlib hist documentation `__ for more. The existing interface ``DataFrame.hist`` to plot histogram still can be used. @@ -308,10 +304,10 @@ The ``by`` keyword can be specified to plot grouped histograms: .. ipython:: python - data = Series(randn(1000)) + data = pd.Series(np.random.randn(1000)) @savefig grouped_hist.png - data.hist(by=randint(0, 4, 1000), figsize=(6, 4)) + data.hist(by=np.random.randint(0, 4, 1000), figsize=(6, 4)) .. 
_visualization.box: @@ -337,7 +333,7 @@ a uniform random variable on [0,1). .. ipython:: python - df = DataFrame(rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) @savefig box_plot_new.png df.plot(kind='box') @@ -392,7 +388,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python - df = DataFrame(rand(10,5)) + df = pd.DataFrame(np.random.rand(10,5)) plt.figure(); @savefig box_plot_ex.png @@ -410,8 +406,8 @@ groupings. For instance, .. ipython:: python :okwarning: - df = DataFrame(rand(10,2), columns=['Col1', 'Col2'] ) - df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + df = pd.DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = pd.Series(['A','A','A','A','A','B','B','B','B','B']) plt.figure(); @@ -430,9 +426,9 @@ columns: .. ipython:: python :okwarning: - df = DataFrame(rand(10,3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) - df['Y'] = Series(['A','B','A','B','A','B','A','B','A','B']) + df = pd.DataFrame(np.random.rand(10,3), columns=['Col1', 'Col2', 'Col3']) + df['X'] = pd.Series(['A','A','A','A','A','B','B','B','B','B']) + df['Y'] = pd.Series(['A','B','A','B','A','B','A','B','A','B']) plt.figure(); @@ -473,7 +469,7 @@ DataFrame. :okwarning: np.random.seed(1234) - df_box = DataFrame(np.random.randn(50, 2)) + df_box = pd.DataFrame(np.random.randn(50, 2)) df_box['g'] = np.random.choice(['A', 'B'], size=50) df_box.loc[df_box['g'] == 'B', 1] += 3 @@ -517,7 +513,7 @@ When input data contains `NaN`, it will be automatically filled by 0. If you wan .. ipython:: python - df = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) @savefig area_plot_stacked.png df.plot(kind='area'); @@ -555,7 +551,7 @@ These can be specified by ``x`` and ``y`` keywords each. .. ipython:: python - df = DataFrame(rand(50, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) @savefig scatter_plot.png df.plot(kind='scatter', x='a', y='b'); @@ -626,7 +622,7 @@ too dense to plot each point individually. .. ipython:: python - df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) df['b'] = df['b'] + np.arange(1000) @savefig hexbin_plot.png @@ -654,7 +650,7 @@ given by column ``z``. The bins are aggregated with numpy's ``max`` function. .. ipython:: python - df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) df['b'] = df['b'] = df['b'] + np.arange(1000) df['z'] = np.random.uniform(0, 3, 1000) @@ -689,7 +685,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python - series = Series(3 * rand(4), index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series') @savefig series_pie_plot.png series.plot(kind='pie', figsize=(6, 6)) @@ -716,7 +712,7 @@ A legend will be drawn in each pie plots by default; specify ``legend=False`` to .. 
ipython:: python - df = DataFrame(3 * rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame(3 * np.random.rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) @savefig df_pie_plot.png df.plot(kind='pie', subplots=True, figsize=(8, 4)) @@ -759,7 +755,7 @@ If you pass values whose sum total is less than 1.0, matplotlib draws a semicirc .. ipython:: python - series = Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') @savefig series_pie_plot_semi.png series.plot(kind='pie', figsize=(6, 6)) @@ -835,7 +831,7 @@ You can create a scatter plot matrix using the .. ipython:: python from pandas.tools.plotting import scatter_matrix - df = DataFrame(randn(1000, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) @savefig scatter_matrix_kde.png scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde') @@ -863,7 +859,7 @@ setting ``kind='kde'``: .. ipython:: python - ser = Series(randn(1000)) + ser = pd.Series(np.random.randn(1000)) @savefig kde_plot.png ser.plot(kind='kde') @@ -888,10 +884,9 @@ of the same class will usually be closer together and form larger structures. .. ipython:: python - from pandas import read_csv from pandas.tools.plotting import andrews_curves - data = read_csv('data/iris.data') + data = pd.read_csv('data/iris.data') plt.figure() @@ -911,10 +906,9 @@ represents one data point. Points that tend to cluster will appear closer togeth .. ipython:: python - from pandas import read_csv from pandas.tools.plotting import parallel_coordinates - data = read_csv('data/iris.data') + data = pd.read_csv('data/iris.data') plt.figure() @@ -946,8 +940,8 @@ implies that the underlying data are not random. plt.figure() - data = Series(0.1 * rand(1000) + - 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) + data = pd.Series(0.1 * np.random.rand(1000) + + 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) @savefig lag_plot.png lag_plot(data) @@ -981,7 +975,7 @@ confidence band. plt.figure() - data = Series(0.7 * rand(1000) + + data = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000))) @savefig autocorrelation_plot.png @@ -1012,7 +1006,7 @@ are what constitutes the bootstrap plot. from pandas.tools.plotting import bootstrap_plot - data = Series(rand(1000)) + data = pd.Series(np.random.rand(1000)) @savefig bootstrap_plot.png bootstrap_plot(data, size=50, samples=500, color='grey') @@ -1042,10 +1036,9 @@ be colored differently. .. ipython:: python - from pandas import read_csv from pandas.tools.plotting import radviz - data = read_csv('data/iris.data') + data = pd.read_csv('data/iris.data') plt.figure() @@ -1095,7 +1088,7 @@ shown by default. .. ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() @savefig frame_plot_basic_noleg.png @@ -1119,7 +1112,7 @@ You may pass ``logy`` to get a log-scale Y axis. .. ipython:: python - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = np.exp(ts.cumsum()) @savefig series_plot_logy.png @@ -1227,8 +1220,6 @@ in ``pandas.plot_params`` can be used in a `with statement`: .. 
ipython:: python - import pandas as pd - plt.figure() @savefig ser_plot_suppress_context.png @@ -1325,10 +1316,10 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a :suppress: np.random.seed(123456) - ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() - df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() .. ipython:: python @@ -1410,7 +1401,7 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and : .. ipython:: python fig, ax = plt.subplots(1, 1) - df = DataFrame(rand(5, 3), columns=['a', 'b', 'c']) + df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c']) ax.get_xaxis().set_visible(False) # Hide Ticks @savefig line_plot_table_true.png @@ -1482,7 +1473,7 @@ To use the cubehelix colormap, we can simply pass ``'cubehelix'`` to ``colormap= .. ipython:: python - df = DataFrame(randn(1000, 10), index=ts.index) + df = pd.DataFrame(np.random.randn(1000, 10), index=ts.index) df = df.cumsum() plt.figure() @@ -1520,7 +1511,7 @@ Colormaps can also be used other plot types, like bar charts: .. ipython:: python - dd = DataFrame(randn(10, 10)).applymap(abs) + dd = pd.DataFrame(np.random.randn(10, 10)).applymap(abs) dd = dd.cumsum() plt.figure() @@ -1587,8 +1578,8 @@ when plotting a large number of points. .. ipython:: python - price = Series(randn(150).cumsum(), - index=date_range('2000-1-1', periods=150, freq='B')) + price = pd.Series(np.random.randn(150).cumsum(), + index=pd.date_range('2000-1-1', periods=150, freq='B')) ma = pd.rolling_mean(price, 20) mstd = pd.rolling_std(price, 20) @@ -1624,18 +1615,8 @@ Trellis plotting interface .. ipython:: python :suppress: - import numpy as np - np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - import pandas.util.testing as tm - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - tips_data = read_csv('data/tips.csv') - iris_data = read_csv('data/iris.data') - from pandas import read_csv - from pandas.tools.plotting import radviz + tips_data = pd.read_csv('data/tips.csv') + iris_data = pd.read_csv('data/iris.data') plt.close('all') @@ -1646,8 +1627,7 @@ Trellis plotting interface .. code-block:: python - from pandas import read_csv - tips_data = read_csv('tips.csv') + tips_data = pd.read_csv('tips.csv') from the directory where you downloaded the file. @@ -1668,7 +1648,6 @@ In the example below, data from the tips data set is arranged by the attributes values, the resulting grid has two columns and two rows. A histogram is displayed for each cell of the grid. - .. ipython:: python plt.figure() diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index d05c19a5e4bea..c8e32ac2a3309 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.16.2.txt + .. include:: whatsnew/v0.16.1.txt .. 
include:: whatsnew/v0.16.0.txt diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 02de919e3f83e..6a14a4024ba5a 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -49,7 +49,7 @@ API changes In [3]: cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) In [4]: cat - Out[4]: + Out[4]: [a, b, a] Categories (3, object): [a < b < c] diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index aa35434802799..f9bef3d9c7f4a 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -474,10 +474,11 @@ Other API Changes - ``Series.values_counts`` and ``Series.describe`` for categorical data will now put ``NaN`` entries at the end. (:issue:`9443`) - ``Series.describe`` for categorical data will now give counts and frequencies of 0, not ``NaN``, for unused categories (:issue:`9443`) -- Due to a bug fix, looking up a partial string label with ``DatetimeIndex.asof`` now includes values that match the string, even if they are after the start of the partial string label (:issue:`9258`). Old behavior: +- Due to a bug fix, looking up a partial string label with ``DatetimeIndex.asof`` now includes values that match the string, even if they are after the start of the partial string label (:issue:`9258`). - .. ipython:: python - :verbatim: + Old behavior: + + .. code-block:: python In [4]: pd.to_datetime(['2000-01-31', '2000-02-28']).asof('2000-02') Out[4]: Timestamp('2000-01-31 00:00:00') diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt old mode 100644 new mode 100755 index 05c762b91b925..fa82a90f2a429 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -1,37 +1,262 @@ .. _whatsnew_0161: -v0.16.1 (April ??, 2015) ------------------------- +v0.16.1 (May 11, 2015) +---------------------- This is a minor bug-fix release from 0.16.0 and includes a a large number of bug fixes along several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. +Highlights include: + +- Support for a ``CategoricalIndex``, a category based index, see :ref:`here ` +- New section on how-to-contribute to *pandas*, see :ref:`here ` +- Revised "Merge, join, and concatenate" documentation, including graphical examples to make it easier to understand each operations, see :ref:`here ` +- New method ``sample`` for drawing random samples from Series, DataFrames and Panels. See :ref:`here ` +- The default ``Index`` printing has changed to a more uniform format, see :ref:`here ` +- ``BusinessHour`` datetime-offset is now supported, see :ref:`here ` + +- Further enhancement to the ``.str`` accessor to make string operations easier, see :ref:`here ` + .. contents:: What's new in v0.16.1 :local: :backlinks: none - .. _whatsnew_0161.enhancements: +.. warning:: + + In pandas 0.17.0, the sub-package ``pandas.io.data`` will be removed in favor of a separately installable package. See :ref:`here for details ` (:issue:`8961`) + Enhancements ~~~~~~~~~~~~ -- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`) +.. _whatsnew_0161.enhancements.categoricalindex: + +CategoricalIndex +^^^^^^^^^^^^^^^^ + +We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting +indexing with duplicates. 
This is a container around a ``Categorical`` (introduced in v0.15.0) +and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1, +setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to a regular object-based ``Index``. + +.. ipython :: python + + df = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('cab')) + }) + df + df.dtypes + df.B.cat.categories + +setting the index will create a ``CategoricalIndex`` + +.. ipython :: python + + df2 = df.set_index('B') + df2.index + +indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an Index with duplicates. +The indexers MUST be in the category or the operation will raise. + +.. ipython :: python + + df2.loc['a'] + +and preserves the ``CategoricalIndex`` + +.. ipython :: python + + df2.loc['a'].index + +sorting will order by the order of the categories + +.. ipython :: python + + df2.sort_index() + +groupby operations on the index will preserve the index nature as well + +.. ipython :: python + + df2.groupby(level=0).sum() + df2.groupby(level=0).sum().index + +reindexing operations will return a resulting index based on the type of the passed +indexer, meaning that passing a list will return a plain-old-``Index``; indexing with +a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories +of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with +values NOT in the categories, similarly to how you can reindex ANY pandas index. + +.. ipython :: python + + df2.reindex(['a','e']) + df2.reindex(['a','e']).index + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index + +See the :ref:`documentation ` for more. (:issue:`7629`, :issue:`10038`, :issue:`10039`) + +.. _whatsnew_0161.enhancements.sample: + +Sample +^^^^^^ + +Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`. +The method accepts a specific number of rows or columns to return, or a fraction of the +total number of rows or columns. It also has options for sampling with or without replacement, +for passing in a column for weights for non-uniform sampling, and for setting seed values to +facilitate replication. (:issue:`2419`) + +.. ipython :: python + + example_series = Series([0,1,2,3,4,5]) + + # When no arguments are passed, returns 1 row + example_series.sample() + + # One may specify either a number of rows: + example_series.sample(n=3) + + # Or a fraction of the rows: + example_series.sample(frac=0.5) + + # weights are accepted. + example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] + example_series.sample(n=3, weights=example_weights) + + # weights will also be normalized if they do not sum to one, + # and missing values will be treated as zeros. + example_weights2 = [0.5, 0, 0, 0, None, np.nan] + example_series.sample(n=1, weights=example_weights2) + + +When applied to a DataFrame, one may pass the name of a column to specify sampling weights +when sampling from rows. + +.. ipython :: python + + df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) + df.sample(n=3, weights='weight_column') + + +.. _whatsnew_0161.enhancements.string: + +String Methods Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:ref:`Continuing from v0.16.0 `, the following +enhancements make string operations easier and more consistent with standard Python string operations.
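+
+As a quick, illustrative sketch (the toy data below is made up purely for
+demonstration), the new element-wise methods mirror their built-in ``str``
+counterparts; the individual enhancements are listed below.
+
+.. ipython:: python
+
+    s = Series(['a_b_c', 'c_d_e', 'f_g_h'])
+
+    # partition() splits each element at the first separator,
+    # like the built-in str.partition
+    s.str.partition('_')
+
+    # capitalize() and swapcase() also mirror the built-in str methods
+    s.str.capitalize()
+    s.str.swapcase()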
+ + +- Added ``StringMethods`` (``.str`` accessor) to ``Index`` (:issue:`9068`) + + The ``.str`` accessor is now available for both ``Series`` and ``Index``. + + .. ipython:: python + + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + + One special case for the `.str` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor + will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression + to work naturally: + + .. ipython:: python + + idx = Index(['a1', 'a2', 'b1', 'b2']) + s = Series(range(4), index=idx) + s + idx.str.startswith('a') + s[s.index.str.startswith('a')] + +- The following new methods are accesible via ``.str`` accessor to apply the function to each values. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) + + ================ =============== =============== =============== ================ + .. .. Methods .. .. + ================ =============== =============== =============== ================ + ``capitalize()`` ``swapcase()`` ``normalize()`` ``partition()`` ``rpartition()`` + ``index()`` ``rindex()`` ``translate()`` + ================ =============== =============== =============== ================ + +- ``split`` now takes ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`) + + .. ipython:: python + + s = Series(['a,b', 'a,c', 'b,c']) + + # return Series + s.str.split(',') + + # return DataFrame + s.str.split(',', expand=True) + + idx = Index(['a,b', 'a,c', 'b,c']) + + # return Index + idx.str.split(',') + + # return MultiIndex + idx.str.split(',', expand=True) + + +- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) + + +.. _whatsnew_0161.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ + +- ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`) + + .. ipython:: python + + from pandas.tseries.offsets import BusinessHour + Timestamp('2014-08-01 09:00') + BusinessHour() + Timestamp('2014-08-01 07:00') + BusinessHour() + Timestamp('2014-08-01 16:30') + BusinessHour() + +- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) + +- Allow ``clip``, ``clip_lower``, and ``clip_upper`` to accept array-like arguments as thresholds (This is a regression from 0.11.0). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). (:issue:`6966`) - ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) +- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any of label does not exist in the target data. (:issue:`6736`) + .. ipython:: python + df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) + df.drop(['A', 'X'], axis=1, errors='ignore') -.. _whatsnew_0161.api: +- Add support for separating years and quarters using dashes, for + example 2014-Q1. (:issue:`9688`) -API changes -~~~~~~~~~~~ +- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) +- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. 
(:issue:`8823`) +- ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) + +- Allow timedelta string conversion when leading zero is missing from time definition, ie `0:00:00` vs `00:00:00`. (:issue:`9570`) +- Allow ``Panel.shift`` with ``axis='items'`` (:issue:`9890`) +- Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) +- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`) +- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) +- Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` +- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) + + +.. _whatsnew_0161.api: + +API changes +~~~~~~~~~~~ - When passing in an ax to ``df.plot( ..., ax=ax)``, the `sharex` kwarg will now default to `False`. The result is that the visibility of xlabels and xticklabels will not anymore be changed. You @@ -40,43 +265,282 @@ API changes If pandas creates the subplots itself (e.g. no passed in `ax` kwarg), then the default is still ``sharex=True`` and the visibility changes are applied. +- :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously + the order was arbitrary. (:issue:`9777`) +- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`) -- Add support for separating years and quarters using dashes, for - example 2014-Q1. (:issue:`9688`) +.. _whatsnew_0161.deprecations: -.. _whatsnew_0161.performance: +Deprecations +^^^^^^^^^^^^ -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ +- ``Series.str.split``'s ``return_type`` keyword was removed in favor of ``expand`` (:issue:`9847`) +.. _whatsnew_0161.index_repr: +Index Representation +~~~~~~~~~~~~~~~~~~~~ +The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanges (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`) +Previous Behavior -.. _whatsnew_0161.bug_fixes: +.. code-block:: python -Bug Fixes -~~~~~~~~~ + In [2]: pd.Index(range(4),name='foo') + Out[2]: Int64Index([0, 1, 2, 3], dtype='int64') -- Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated. 
-- Bug in json serialization when frame has length zero.(:issue:`9805`) + In [3]: pd.Index(range(104),name='foo') + Out[3]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...], dtype='int64') + In [4]: pd.date_range('20130101',periods=4,name='foo',tz='US/Eastern') + Out[4]: + + [2013-01-01 00:00:00-05:00, ..., 2013-01-04 00:00:00-05:00] + Length: 4, Freq: D, Timezone: US/Eastern -- Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) + In [5]: pd.date_range('20130101',periods=104,name='foo',tz='US/Eastern') + Out[5]: + + [2013-01-01 00:00:00-05:00, ..., 2013-04-14 00:00:00-04:00] + Length: 104, Freq: D, Timezone: US/Eastern +New Behavior +.. ipython:: python + pd.set_option('display.width', 80) + pd.Index(range(4), name='foo') + pd.Index(range(30), name='foo') + pd.Index(range(104), name='foo') + pd.CategoricalIndex(['a','bb','ccc','dddd'], ordered=True, name='foobar') + pd.CategoricalIndex(['a','bb','ccc','dddd']*10, ordered=True, name='foobar') + pd.CategoricalIndex(['a','bb','ccc','dddd']*100, ordered=True, name='foobar') + pd.date_range('20130101',periods=4, name='foo', tz='US/Eastern') + pd.date_range('20130101',periods=25, freq='D') + pd.date_range('20130101',periods=104, name='foo', tz='US/Eastern') -- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) +.. _whatsnew_0161.performance: -- Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`) -- Bug where ``TimdeltaIndex`` were not properly serialized in fixed ``HDFStore`` (:issue:`9635`) +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`) +- Improved csv write performance generally by 2x (:issue:`9940`) +- Improved the performance of ``pd.lib.max_len_string_array`` by 5-7x (:issue:`10024`) -- Bug in plotting continuously using ``secondary_y`` may not show legend properly. (:issue:`9610`, :issue:`9779`) +.. _whatsnew_0161.bug_fixes: + +Bug Fixes +~~~~~~~~~ +- Bug where labels did not appear properly in the legend of ``DataFrame.plot()``, passing ``label=`` arguments works, and Series indices are no longer mutated. (:issue:`9542`) +- Bug in json serialization causing a segfault when a frame had zero length. (:issue:`9805`) +- Bug in ``read_csv`` where missing trailing delimiters would cause segfault. (:issue:`5664`) +- Bug in retaining index name on appending (:issue:`9862`) +- Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) +- Fixed bug in ``StataWriter`` resulting in changes to input ``DataFrame`` upon save (:issue:`9795`). 
+- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`)
+- Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`)
+- Bug in grouping with multiple ``pd.Grouper`` where one is non-time based (:issue:`10063`)
+- Bug in ``read_sql_table`` error when reading postgres table with timezone (:issue:`7139`)
+- Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`)
+- Bug where ``TimedeltaIndex`` were not properly serialized in fixed ``HDFStore`` (:issue:`9635`)
+- Bug with ``TimedeltaIndex`` constructor ignoring ``name`` when given another ``TimedeltaIndex`` as data (:issue:`10025`).
+- Bug in ``DataFrameFormatter._get_formatted_index`` with not applying ``max_colwidth`` to the ``DataFrame`` index (:issue:`7856`)
+- Bug in ``.loc`` with a read-only ndarray data source (:issue:`10043`)
+- Bug in ``groupby.apply()`` that would raise if a passed user defined function returned only ``None`` for all input (:issue:`9685`)
+- Always use temporary files in pytables tests (:issue:`9992`)
+- Bug in plotting continuously using ``secondary_y`` may not show legend properly. (:issue:`9610`, :issue:`9779`)
+- Bug in ``DataFrame.plot(kind="hist")`` results in ``TypeError`` when ``DataFrame`` contains non-numeric columns (:issue:`9853`)
+- Bug where repeated plotting of ``DataFrame`` with a ``DatetimeIndex`` may raise ``TypeError`` (:issue:`9852`)
+- Bug in ``setup.py`` that would allow an incompatible cython version to build (:issue:`9827`)
+- Bug in plotting ``secondary_y`` incorrectly attaches ``right_ax`` property to secondary axes specifying itself recursively. (:issue:`9861`)
- Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`)
- Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`)
+- Bug in ``ParserBase.convert_to_nd_arrays`` when called by ``DataFrame.read_fwf`` (:issue:`9266`)
+- Bug in ``FloatArrayFormatter`` where the decision boundary for displaying "small" floats in decimal format was off by one order of magnitude for a given ``display.precision`` (:issue:`9764`)
+- Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
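+
+  A quick sketch of what is now allowed; the frame below is only example data:
+
+  .. code-block:: python
+
+     df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'])
+
+     # an explicit color together with a style string that carries no
+     # color symbol of its own no longer raises
+     df.plot(color='green', style='--')
+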
+- Not showing a ``DeprecationWarning`` on combining list-likes with an ``Index`` (:issue:`10083`)
+- Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`)
+- Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`)
+- Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`)
+- Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`)
+- Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`)
+- Bug in unequal comparisons between a ``Series`` of dtype ``"category"`` and a scalar (e.g. ``Series(Categorical(list("abc"), categories=list("cba"), ordered=True)) > "b"``), which wouldn't use the order of the categories but the lexicographical order. (:issue:`9848`)
+- Bug in ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`)
+- Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`)
+- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)
+- Bug in C csv parser causing spurious NaNs when data started with newline followed by whitespace. (:issue:`10022`)
+- Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`)
+- Bug where ``.iloc`` and ``.loc`` behavior is not consistent on empty dataframes (:issue:`9964`)
+- Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`)
+- Bug in unequal comparisons between categorical data and a scalar, which was not in the categories (e.g. ``Series(Categorical(list("abc"), ordered=True)) > "d"``). This returned ``False`` for all elements, but now raises a ``TypeError``. Equality comparisons also now return ``False`` for ``==`` and ``True`` for ``!=``. (:issue:`9848`)
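+
+  A sketch of the new behaviour; the values are only illustrative, and the error text follows the message added to the ``Categorical`` comparison code later in this diff:
+
+  .. code-block:: python
+
+     cat = pd.Categorical(list("abc"), ordered=True)
+
+     cat == "d"   # array([False, False, False])
+     cat != "d"   # array([ True,  True,  True])
+     cat > "d"    # raises TypeError: Cannot compare a Categorical for op __gt__
+                  # with a scalar, which is not a category.
+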
+- Bug in DataFrame ``__setitem__`` when right hand side is a dictionary (:issue:`9874`)
+- Bug in ``where`` when dtype is ``datetime64/timedelta64``, but dtype of other is not (:issue:`9804`)
+- Bug in ``MultiIndex.sortlevel()`` results in unicode level name breaks (:issue:`9856`)
+- Bug in which ``groupby.transform`` incorrectly enforced output dtypes to match input dtypes. (:issue:`9807`)
+- Bug in ``DataFrame`` constructor when ``columns`` parameter is set, and ``data`` is an empty list (:issue:`9939`)
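+
+  For illustration, with hypothetical column names; this call previously raised and now returns an empty frame:
+
+  .. code-block:: python
+
+     pd.DataFrame([], columns=['A', 'B'])
+     # Empty DataFrame
+     # Columns: [A, B]
+     # Index: []
+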
+- Bug in bar plot with ``log=True`` raises ``TypeError`` if all values are less than 1 (:issue:`9905`)
+- Bug in horizontal bar plot ignores ``log=True`` (:issue:`9905`)
+- Bug in PyTables queries that did not return proper results using the index (:issue:`8265`, :issue:`9676`)
+- Bug where dividing a dataframe containing values of type ``Decimal`` by another ``Decimal`` would raise. (:issue:`9787`)
+- Bug where using DataFrames asfreq would remove the name of the index. (:issue:`9885`)
+- Bug causing extra index point when resample BM/BQ (:issue:`9756`)
+- Changed caching in ``AbstractHolidayCalendar`` to be at the instance level rather than at the class level as the latter can result in unexpected behaviour. (:issue:`9552`)
+- Fixed latex output for multi-indexed dataframes (:issue:`9778`)
+- Bug causing an exception when setting an empty range using ``DataFrame.loc`` (:issue:`9596`)
+- Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`)
+- Bug in ``transform`` and ``filter`` when grouping on a categorical variable (:issue:`9921`)
+- Bug in ``transform`` when groups are equal in number and dtype to the input index (:issue:`9700`)
+- Google BigQuery connector now imports dependencies on a per-method basis. (:issue:`9713`)
+- Updated BigQuery connector to no longer use deprecated ``oauth2client.tools.run()`` (:issue:`8327`)
+- Bug in subclassed ``DataFrame``. It may not return the correct class when slicing or subsetting it. (:issue:`9632`)
+- Bug in ``.median()`` where non-float null values are not handled correctly (:issue:`10040`)
+- Bug in ``Series.fillna()`` where it raises if a numerically convertible string is given (:issue:`10092`)
diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
new file mode 100644
index 0000000000000..b571aab0b19a5
--- /dev/null
+++ b/doc/source/whatsnew/v0.16.2.txt
@@ -0,0 +1,85 @@
+.. _whatsnew_0162:
+
+v0.16.2 (June 12, 2015)
+-----------------------
+
+This is a minor bug-fix release from 0.16.1 and includes a large number of
+bug fixes along with several new features, enhancements, and performance improvements.
+We recommend that all users upgrade to this version.
+
+Highlights include:
+
+Check the :ref:`API Changes ` before updating.
+
+.. contents:: What's new in v0.16.2
+    :local:
+    :backlinks: none
+
+.. _whatsnew_0162.enhancements:
+
+New features
+~~~~~~~~~~~~
+
+.. _whatsnew_0162.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+.. _whatsnew_0162.api:
+
+Backwards incompatible API changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _whatsnew_0162.api_breaking:
+
+.. _whatsnew_0162.api_breaking.other:
+
+Other API Changes
+^^^^^^^^^^^^^^^^^
+
+- ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in the constructor. (:issue:`10217`)
+
+.. _whatsnew_0162.performance:
+
+Performance Improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Improved ``Series.resample`` performance with ``dtype=datetime64[ns]`` (:issue:`7754`)
+
+.. _whatsnew_0162.bug_fixes:
+
+Bug Fixes
+~~~~~~~~~
+
+- Bug where ``read_hdf`` ``store.select`` modifies the passed columns list when
+  multi-indexed (:issue:`7212`)
+- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
+- Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`)
+- Bug where ``Panel.from_dict`` does not set dtype when specified (:issue:`10058`)
+- Bug in ``Index.union`` raises ``AttributeError`` when passing array-likes. (:issue:`10149`)
+- Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
+- Bug in ``NaT`` raises ``AttributeError`` when accessing ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
+- Bug in getting timezone data with ``dateutil`` on various platforms (:issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)
+- Bug in display datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision.
(:issue:`10170`) + +- Bung in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`) + +- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) + + +- Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`) + +- Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) + +- Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`) +- Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`) + + +- Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) + + +- Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt new file mode 100644 index 0000000000000..6ad108dc020c2 --- /dev/null +++ b/doc/source/whatsnew/v0.17.0.txt @@ -0,0 +1,60 @@ +.. _whatsnew_0170: + +v0.17.0 (July 31, 2015) +----------------------- + +This is a major release from 0.16.2 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + + +Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. + +.. contents:: What's new in v0.17.0 + :local: + :backlinks: none + +.. _whatsnew_0170.enhancements: + +New features +~~~~~~~~~~~~ + +.. _whatsnew_0170.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0170.api: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0170.api_breaking: + +.. _whatsnew_0170.api_breaking.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0170.deprecations: + +Deprecations +^^^^^^^^^^^^ + +.. _whatsnew_0170.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0170.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0170.bug_fixes: + +Bug Fixes +~~~~~~~~~ +fixed bug in csv parsing when using a converting that specified uint8 (:issue: '9266') diff --git a/pandas/__init__.py b/pandas/__init__.py index 939495d3687ad..2a142a6ff2072 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -4,17 +4,13 @@ __docformat__ = 'restructuredtext' try: - from . import hashtable, tslib, lib -except Exception: # pragma: no cover - import sys - e = sys.exc_info()[1] # Py25 and Py3 current exception syntax conflict - print(e) - if 'No module named lib' in str(e): - raise ImportError('C extensions not built: if you installed already ' - 'verify that you are not importing from the source ' - 'directory') - else: - raise + from pandas import hashtable, tslib, lib +except ImportError as e: # pragma: no cover + module = str(e).lstrip('cannot import name ') # hack but overkill to use re + raise ImportError("C extension: {0} not built. 
If you want to import " + "pandas from the source directory, you may need to run " + "'python setup.py build_ext --inplace' to build the C " + "extensions first.".format(module)) from datetime import datetime import numpy as np diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index bff6eb1f95abc..2a273629544cb 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,6 +26,7 @@ Other items: * OrderedDefaultDict +* platform checker """ # pylint disable=W0611 import functools @@ -37,6 +38,8 @@ PY3 = (sys.version_info[0] >= 3) PY3_2 = sys.version_info[:2] == (3, 2) +PY2 = sys.version_info[0] == 2 + try: import __builtin__ as builtins @@ -752,3 +755,16 @@ def __missing__(self, key): def __reduce__(self): # optional, for pickle support args = self.default_factory if self.default_factory else tuple() return type(self), args, None, None, list(self.items()) + + +# https://github.com/pydata/pandas/pull/9123 +def is_platform_windows(): + return sys.platform == 'win32' or sys.platform == 'cygwin' + + +def is_platform_linux(): + return sys.platform == 'linux2' + + +def is_platform_mac(): + return sys.platform == 'darwin' diff --git a/pandas/core/api.py b/pandas/core/api.py index a8b10342593ce..fde9bc77c4bd9 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/base.py b/pandas/core/base.py index dde2e74132c4b..540b900844a9e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1,16 +1,14 @@ """ Base and utility classes for pandas objects. 
""" -import datetime - from pandas import compat import numpy as np from pandas.core import common as com import pandas.core.nanops as nanops -import pandas.tslib as tslib import pandas.lib as lib from pandas.util.decorators import Appender, cache_readonly - +from pandas.core.strings import StringMethods +from pandas.core.common import AbstractMethodError _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', @@ -31,7 +29,7 @@ class StringMixin(object): # Formatting def __unicode__(self): - raise NotImplementedError + raise AbstractMethodError(self) def __str__(self): """ @@ -85,16 +83,22 @@ def __unicode__(self): # Should be overwritten by base classes return object.__repr__(self) - def _local_dir(self): - """ provide addtional __dir__ for this object """ - return [] + def _dir_additions(self): + """ add addtional __dir__ for this object """ + return set() + + def _dir_deletions(self): + """ delete unwanted __dir__ for this object """ + return set() def __dir__(self): """ Provide method name lookup and completion Only provide 'public' methods """ - return list(sorted(list(set(dir(type(self)) + self._local_dir())))) + rv = set(dir(type(self))) + rv = (rv - self._dir_deletions()) | self._dir_additions() + return sorted(rv) def _reset_cache(self, key=None): """ @@ -120,7 +124,7 @@ def _delegate_method(self, name, *args, **kwargs): raise TypeError("You cannot call method {name}".format(name=name)) @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ): + def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): """ add accessors to cls from the delegate class @@ -130,6 +134,8 @@ def _add_delegate_accessors(cls, delegate, accessors, typ): delegate : the class to get methods/properties & doc-strings acccessors : string list of accessors to add typ : 'property' or 'method' + overwrite : boolean, default False + overwrite the method/property in the target class if it exists """ @@ -163,7 +169,7 @@ def f(self, *args, **kwargs): f = _create_delegator_method(name) # don't overwrite existing methods/properties - if not hasattr(cls, name): + if overwrite or not hasattr(cls, name): setattr(cls,name,f) @@ -497,6 +503,41 @@ def searchsorted(self, key, side='left'): #### needs tests/doc-string return self.values.searchsorted(key, side=side) + # string methods + def _make_str_accessor(self): + from pandas.core.series import Series + from pandas.core.index import Index + if isinstance(self, Series) and not com.is_object_dtype(self.dtype): + # this really should exclude all series with any non-string values, + # but that isn't practical for performance reasons until we have a + # str dtype (GH 9343) + raise AttributeError("Can only use .str accessor with string " + "values, which use np.object_ dtype in " + "pandas") + elif isinstance(self, Index): + # see scc/inferrence.pyx which can contain string values + allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') + if self.inferred_type not in allowed_types: + message = ("Can only use .str accessor with string values " + "(i.e. 
inferred_type is 'string', 'unicode' or 'mixed')") + raise AttributeError(message) + if self.nlevels > 1: + message = "Can only use .str accessor with Index, not MultiIndex" + raise AttributeError(message) + return StringMethods(self) + + str = AccessorProperty(StringMethods, _make_str_accessor) + + def _dir_additions(self): + return set() + + def _dir_deletions(self): + try: + getattr(self, 'str') + except AttributeError: + return set(['str']) + return set() + _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed @@ -547,4 +588,4 @@ def duplicated(self, take_last=False): # abstracts def _update_inplace(self, result, **kwargs): - raise NotImplementedError + raise AbstractMethodError(self) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 991678a8e7d79..c5cd8390359dc 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -9,20 +9,18 @@ from pandas.core.algorithms import factorize from pandas.core.base import PandasObject, PandasDelegate -from pandas.core.index import Index, _ensure_index -from pandas.tseries.period import PeriodIndex import pandas.core.common as com -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import cache_readonly, deprecate_kwarg -from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull, +from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, + isnull, notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype, is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like, is_sequence, is_null_slice, is_bool, _ensure_platform_int, _ensure_object, _ensure_int64, - _coerce_indexer_dtype, _values_from_object, take_1d) + _coerce_indexer_dtype, take_1d) from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option -from pandas.core import format as fmt def _cat_compare_op(op): def f(self, other): @@ -61,7 +59,14 @@ def f(self, other): i = self.categories.get_loc(other) return getattr(self._codes, op)(i) else: - return np.repeat(False, len(self)) + if op == '__eq__': + return np.repeat(False, len(self)) + elif op == '__ne__': + return np.repeat(True, len(self)) + else: + msg = "Cannot compare a Categorical for op {op} with a scalar, " \ + "which is not a category." + raise TypeError(msg.format(op=op)) else: # allow categorical vs object dtype array comparisons for equality @@ -79,7 +84,7 @@ def f(self, other): def maybe_to_categorical(array): """ coerce to a categorical if a series is given """ - if isinstance(array, ABCSeries): + if isinstance(array, (ABCSeries, ABCCategoricalIndex)): return array.values return array @@ -229,15 +234,17 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F # sanitize input if is_categorical_dtype(values): - # we are either a Series or a Categorical - cat = values - if isinstance(values, ABCSeries): - cat = values.values + # we are either a Series or a CategoricalIndex + if isinstance(values, (ABCSeries, ABCCategoricalIndex)): + values = values.values + + if ordered is None: + ordered = values.ordered if categories is None: - categories = cat.categories + categories = values.categories values = values.__array__() - elif isinstance(values, Index): + elif isinstance(values, ABCIndexClass): pass else: @@ -288,11 +295,11 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F warn("Values and categories have different dtypes. 
Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - if is_integer_dtype(values) and (codes == -1).all(): + if len(values) and is_integer_dtype(values) and (codes == -1).all(): warn("None of the categories were found in values. Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - self.set_ordered(ordered, inplace=True) + self.set_ordered(ordered or False, inplace=True) self.categories = categories self.name = name self._codes = _coerce_indexer_dtype(codes, categories) @@ -302,11 +309,27 @@ def copy(self): return Categorical(values=self._codes.copy(),categories=self.categories, name=self.name, ordered=self.ordered, fastpath=True) + def astype(self, dtype): + """ coerce this type to another dtype """ + if is_categorical_dtype(dtype): + return self + return np.array(self, dtype=dtype) + @cache_readonly def ndim(self): """Number of dimensions of the Categorical """ return self._codes.ndim + @cache_readonly + def size(self): + """ return the len of myself """ + return len(self) + + @cache_readonly + def itemsize(self): + """ return the size of a single category """ + return self.categories.itemsize + def reshape(self, new_shape, **kwargs): """ compat with .reshape """ return self @@ -388,7 +411,8 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) def _get_labels(self): - """ Get the category labels (deprecated). + """ + Get the category labels (deprecated). Deprecated, use .codes! """ @@ -402,8 +426,10 @@ def _get_labels(self): @classmethod def _validate_categories(cls, categories): - """" Validates that we have good categories """ - if not isinstance(categories, Index): + """ + Validates that we have good categories + """ + if not isinstance(categories, ABCIndexClass): dtype = None if not hasattr(categories, "dtype"): categories = _convert_to_list_like(categories) @@ -414,6 +440,8 @@ def _validate_categories(cls, categories): with_na = np.array(categories) if with_na.dtype != without_na.dtype: dtype = "object" + + from pandas import Index categories = Index(categories, dtype=dtype) if not categories.is_unique: raise ValueError('Categorical categories must be unique') @@ -680,7 +708,7 @@ def add_categories(self, new_categories, inplace=False): if len(already_included) != 0: msg = "new categories must not include old categories: %s" % str(already_included) raise ValueError(msg) - new_categories = list(self._categories) + (new_categories) + new_categories = list(self._categories) + list(new_categories) new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() cat._categories = new_categories @@ -754,6 +782,8 @@ def remove_unused_categories(self, inplace=False): cat = self if inplace else self.copy() _used = sorted(np.unique(cat._codes)) new_categories = cat.categories.take(_ensure_platform_int(_used)) + + from pandas.core.index import _ensure_index new_categories = _ensure_index(new_categories) cat._codes = _get_codes_for_values(cat.__array__(), new_categories) cat._categories = new_categories @@ -783,7 +813,8 @@ def shape(self): return tuple([len(self._codes)]) def __array__(self, dtype=None): - """ The numpy array interface. + """ + The numpy array interface. 
Returns ------- @@ -792,7 +823,7 @@ def __array__(self, dtype=None): dtype as categorical.categories.dtype """ ret = take_1d(self.categories.values, self._codes) - if dtype and dtype != self.categories.dtype: + if dtype and not is_dtype_equal(dtype,self.categories.dtype): return np.asarray(ret, dtype) return ret @@ -990,7 +1021,7 @@ def get_values(self): """ # if we are a period index, return a string repr - if isinstance(self.categories, PeriodIndex): + if isinstance(self.categories, ABCPeriodIndex): return take_1d(np.array(self.categories.to_native_types(), dtype=object), self._codes) @@ -1137,7 +1168,8 @@ def to_dense(self): """ return np.asarray(self) - def fillna(self, fill_value=None, method=None, limit=None): + @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value') + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. Parameters @@ -1149,17 +1181,24 @@ def fillna(self, fill_value=None, method=None, limit=None): value : scalar Value to use to fill holes (e.g. 0) limit : int, default None - Maximum size gap to forward or backward fill (not implemented yet!) + (Not implemented yet for Categorical!) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Returns ------- filled : Categorical with NA/NaN filled """ - if fill_value is None: - fill_value = np.nan + if value is None: + value = np.nan if limit is not None: - raise NotImplementedError + raise NotImplementedError("specifying a limit for fillna has not " + "been implemented yet") values = self._codes @@ -1171,24 +1210,23 @@ def fillna(self, fill_value=None, method=None, limit=None): # we only have one NA in categories values[values == nan_pos] = -1 - # pad / bfill if method is not None: - values = self.to_dense().reshape(-1,len(self)) + values = self.to_dense().reshape(-1, len(self)) values = com.interpolate_2d( - values, method, 0, None, fill_value).astype(self.categories.dtype)[0] + values, method, 0, None, value).astype(self.categories.dtype)[0] values = _get_codes_for_values(values, self.categories) else: - if not isnull(fill_value) and fill_value not in self.categories: + if not isnull(value) and value not in self.categories: raise ValueError("fill value must be in categories") mask = values==-1 if mask.any(): values = values.copy() - values[mask] = self.categories.get_loc(fill_value) + values[mask] = self.categories.get_loc(value) return Categorical(values, categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) @@ -1235,7 +1273,8 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(np.array(self)) - def _tidy_repr(self, max_vals=10): + def _tidy_repr(self, max_vals=10, footer=True): + """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 head = self[:num]._get_repr(length=False, name=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, @@ -1243,28 +1282,35 @@ def _tidy_repr(self, max_vals=10): footer=False) result = '%s, ..., %s' % (head[:-1], tail[1:]) - result = '%s\n%s' % (result, self._repr_footer()) + if footer: + result = '%s\n%s' % (result, self._repr_footer()) return compat.text_type(result) - def _repr_categories_info(self): 
- """ Returns a string representation of the footer.""" - + def _repr_categories(self): + """ return the base repr for the categories """ max_categories = (10 if get_option("display.max_categories") == 0 else get_option("display.max_categories")) + from pandas.core import format as fmt category_strs = fmt.format_array(self.categories.get_values(), None) if len(category_strs) > max_categories: num = max_categories // 2 head = category_strs[:num] tail = category_strs[-(max_categories - num):] category_strs = head + ["..."] + tail + # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] + return category_strs + + def _repr_categories_info(self): + """ Returns a string representation of the footer.""" + + category_strs = self._repr_categories() levheader = "Categories (%d, %s): " % (len(self.categories), self.categories.dtype) width, height = get_terminal_size() - max_width = (width if get_option("display.width") == 0 - else get_option("display.width")) + max_width = get_option("display.width") or width if com.in_ipython_frontend(): # 0 = no breaks max_width = 0 @@ -1291,8 +1337,11 @@ def _repr_footer(self): len(self), self._repr_categories_info()) def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): - formatter = fmt.CategoricalFormatter(self, name=name, - length=length, na_rep=na_rep, + from pandas.core import format as fmt + formatter = fmt.CategoricalFormatter(self, + name=name, + length=length, + na_rep=na_rep, footer=footer) result = formatter.to_string() return compat.text_type(result) @@ -1307,9 +1356,9 @@ def __unicode__(self): name=True) else: result = '[], %s' % self._get_repr(name=True, - length=False, - footer=True, - ).replace("\n",", ") + length=False, + footer=True, + ).replace("\n",", ") return result @@ -1350,6 +1399,8 @@ def __setitem__(self, key, value): "categories") rvalue = value if is_list_like(value) else [value] + + from pandas import Index to_add = Index(rvalue).difference(self.categories) # no assignments of values not in categories, but it's always ok to set something to np.nan @@ -1508,11 +1559,27 @@ def equals(self, other): ------- are_equal : boolean """ - if not isinstance(other, Categorical): - return False # TODO: should this also test if name is equal? 
- return (self.categories.equals(other.categories) and self.ordered == other.ordered and - np.array_equal(self._codes, other._codes)) + return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes) + + def is_dtype_equal(self, other): + """ + Returns True if categoricals are the same dtype + same categories, and same ordered + + Parameters + ---------- + other : Categorical + + Returns + ------- + are_equal : boolean + """ + + try: + return self.categories.equals(other.categories) and self.ordered == other.ordered + except (AttributeError, TypeError): + return False def describe(self): """ Describes this Categorical @@ -1596,18 +1663,20 @@ def _delegate_method(self, name, *args, **kwargs): ##### utility routines ##### def _get_codes_for_values(values, categories): - """" + """ utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables - if values.dtype != categories.dtype: + if not is_dtype_equal(values.dtype,categories.dtype): values = _ensure_object(values) categories = _ensure_object(categories) + (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - t = hash_klass(len(categories)) - t.map_locations(_values_from_object(categories)) - return _coerce_indexer_dtype(t.lookup(values), categories) + (_, _), cats = _get_data_algo(categories, _hashtables) + t = hash_klass(len(cats)) + t.map_locations(cats) + return _coerce_indexer_dtype(t.lookup(vals), cats) def _convert_to_list_like(list_like): if hasattr(list_like, "dtype"): diff --git a/pandas/core/common.py b/pandas/core/common.py index ec805aba34d48..1c9326c047a79 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -39,6 +39,17 @@ class AmbiguousIndexError(PandasError, KeyError): pass +class AbstractMethodError(NotImplementedError): + """Raise this error instead of NotImplementedError for abstract methods + while keeping compatibility with Python 2 and Python 3. 
+ """ + def __init__(self, class_instance): + self.class_instance = class_instance + + def __str__(self): + return "This method must be defined on the concrete class of " \ + + self.class_instance.__class__.__name__ + _POSSIBLY_CAST_DTYPES = set([np.dtype(t).name for t in ['O', 'int8', 'uint8', 'int16', 'uint16', 'int32', @@ -72,6 +83,16 @@ def _check(cls, inst): ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)) ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)) ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) +ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", + "int64index", + "float64index", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex")) + ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) @@ -1397,14 +1418,19 @@ def _fill_zeros(result, x, y, name, fill): mask the nan's from x """ - if fill is None or is_float_dtype(result): return result if name.startswith(('r', '__r')): x,y = y,x - if np.isscalar(y): + is_typed_variable = (hasattr(y, 'dtype') or hasattr(y,'type')) + is_scalar = lib.isscalar(y) + + if not is_typed_variable and not is_scalar: + return result + + if is_scalar: y = np.array(y) if is_integer_dtype(y): @@ -2439,8 +2465,27 @@ def _get_dtype_type(arr_or_dtype): return np.dtype(arr_or_dtype).type elif isinstance(arr_or_dtype, CategoricalDtype): return CategoricalDtypeType - return arr_or_dtype.dtype.type + elif isinstance(arr_or_dtype, compat.string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtypeType + return _get_dtype_type(np.dtype(arr_or_dtype)) + try: + return arr_or_dtype.dtype.type + except AttributeError: + raise ValueError('%r is not a dtype' % arr_or_dtype) + +def is_dtype_equal(source, target): + """ return a boolean if the dtypes are equal """ + source = _get_dtype_type(source) + target = _get_dtype_type(target) + + try: + return source == target + except TypeError: + # invalid comparison + # object == category will hit this + return False def is_any_int_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) @@ -2510,7 +2555,11 @@ def is_floating_dtype(arr_or_dtype): def is_bool_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) + try: + tipo = _get_dtype_type(arr_or_dtype) + except ValueError: + # this isn't even a dtype + return False return issubclass(tipo, np.bool_) def is_categorical(array): @@ -2637,7 +2686,12 @@ def _astype_nansafe(arr, dtype, copy=True): if not isinstance(dtype, np.dtype): dtype = _coerce_to_dtype(dtype) - if is_datetime64_dtype(arr): + if issubclass(dtype.type, compat.text_type): + # in Py3 that's str, in Py2 that's unicode + return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + elif issubclass(dtype.type, compat.string_types): + return lib.astype_str(arr.ravel()).reshape(arr.shape) + elif is_datetime64_dtype(arr): if dtype == object: return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: @@ -2675,11 +2729,6 @@ def _astype_nansafe(arr, dtype, copy=True): elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): # work around NumPy brokenness, #1987 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) - elif 
issubclass(dtype.type, compat.text_type): - # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) - elif issubclass(dtype.type, compat.string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) if copy: return arr.astype(dtype) @@ -3083,7 +3132,7 @@ def in_ipython_frontend(): # working with straight ascii. -def _pprint_seq(seq, _nest_lvl=0, **kwds): +def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): """ internal. pprinter for iterables. you should probably use pprint_thing() rather then calling this directly. @@ -3095,12 +3144,15 @@ def _pprint_seq(seq, _nest_lvl=0, **kwds): else: fmt = u("[%s]") if hasattr(seq, '__setitem__') else u("(%s)") - nitems = get_option("max_seq_items") or len(seq) + if max_seq_items is False: + nitems = len(seq) + else: + nitems = max_seq_items or get_option("max_seq_items") or len(seq) s = iter(seq) r = [] for i in range(min(nitems, len(seq))): # handle sets, no slicing - r.append(pprint_thing(next(s), _nest_lvl + 1, **kwds)) + r.append(pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) body = ", ".join(r) if nitems < len(seq): @@ -3111,7 +3163,7 @@ def _pprint_seq(seq, _nest_lvl=0, **kwds): return fmt % body -def _pprint_dict(seq, _nest_lvl=0, **kwds): +def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): """ internal. pprinter for iterables. you should probably use pprint_thing() rather then calling this directly. @@ -3121,11 +3173,14 @@ def _pprint_dict(seq, _nest_lvl=0, **kwds): pfmt = u("%s: %s") - nitems = get_option("max_seq_items") or len(seq) + if max_seq_items is False: + nitems = len(seq) + else: + nitems = max_seq_items or get_option("max_seq_items") or len(seq) for k, v in list(seq.items())[:nitems]: - pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, **kwds), - pprint_thing(v, _nest_lvl + 1, **kwds))) + pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))) if nitems < len(seq): return fmt % (", ".join(pairs) + ", ...") @@ -3134,7 +3189,7 @@ def _pprint_dict(seq, _nest_lvl=0, **kwds): def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, - quote_strings=False): + quote_strings=False, max_seq_items=None): """ This function is the sanctioned way of converting objects to a unicode representation. 
@@ -3153,6 +3208,8 @@ def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, replacements default_escapes : bool, default False Whether the input escape characters replaces or adds to the defaults + max_seq_items : False, int, default None + Pass thru to other pretty printers to limit sequence printing Returns ------- @@ -3191,11 +3248,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return compat.text_type(thing) elif (isinstance(thing, dict) and _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_dict(thing, _nest_lvl, quote_strings=True) + result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items) elif is_sequence(thing) and _nest_lvl < \ get_option("display.pprint_nest_depth"): result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, - quote_strings=quote_strings) + quote_strings=quote_strings, max_seq_items=max_seq_items) elif isinstance(thing, compat.string_types) and quote_strings: if compat.PY3: fmt = "'%s'" @@ -3265,8 +3322,42 @@ def save(obj, path): # TODO remove in 0.13 def _maybe_match_name(a, b): - a_name = getattr(a, 'name', None) - b_name = getattr(b, 'name', None) - if a_name == b_name: - return a_name + a_has = hasattr(a, 'name') + b_has = hasattr(b, 'name') + if a_has and b_has: + if a.name == b.name: + return a.name + else: + return None + elif a_has: + return a.name + elif b_has: + return b.name return None + +def _random_state(state=None): + """ + Helper function for processing random_state arguments. + + Parameters + ---------- + state : int, np.random.RandomState, None. + If receives an int, passes to np.random.RandomState() as seed. + If receives an np.random.RandomState object, just returns object. + If receives `None`, returns an np.random.RandomState object. + If receives anything else, raises an informative ValueError. + Default None. 
+ + Returns + ------- + np.random.RandomState + """ + + if is_integer(state): + return np.random.RandomState(state) + elif isinstance(state, np.random.RandomState): + return state + elif state is None: + return np.random.RandomState() + else: + raise ValueError("random_state must be an integer, a numpy RandomState, or None") diff --git a/pandas/core/format.py b/pandas/core/format.py index b21ca9050ffd0..3ab41ded1deea 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -14,15 +14,14 @@ from pandas.core.config import get_option, set_option import pandas.core.common as com import pandas.lib as lib -from pandas.tslib import iNaT, Timestamp, Timedelta - +from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex import numpy as np import itertools import csv -from pandas.tseries.period import PeriodIndex, DatetimeIndex - docstring_to_string = """ Parameters ---------- @@ -613,8 +612,12 @@ def get_col_type(dtype): name = any(self.frame.columns.names) for i, lev in enumerate(self.frame.index.levels): lev2 = lev.format(name=name) - width = len(lev2[0]) - lev3 = [' ' * width] * clevels + lev2 + blank = ' ' * len(lev2[0]) + lev3 = [blank] * clevels + for level_idx, group in itertools.groupby( + self.frame.index.labels[i]): + count = len(list(group)) + lev3.extend([lev2[level_idx]] + [blank] * (count - 1)) strcols.insert(i, lev3) if column_format is None: @@ -773,6 +776,9 @@ def _get_formatted_index(self, frame): formatter=fmt) else: fmt_index = [index.format(name=show_index_names, formatter=fmt)] + fmt_index = [tuple(_make_fixed_width( + list(x), justify='left', minimum=(self.col_space or 0))) + for x in fmt_index] adjoined = adjoin(1, *fmt_index).split('\n') @@ -1255,9 +1261,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, - date_format=date_format) + date_format=date_format, + quoting=self.quoting) else: - cols = list(cols) + cols = np.asarray(list(cols)) self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes @@ -1266,9 +1273,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, - date_format=date_format) + date_format=date_format, + quoting=self.quoting) else: - cols = list(cols) + cols = np.asarray(list(cols)) # save it self.cols = cols @@ -1367,8 +1375,10 @@ def strftime_with_nulls(x): values = self.obj.copy() values.index = data_index values.columns = values.columns.to_native_types( - na_rep=na_rep, float_format=float_format, - date_format=date_format) + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting) values = values[cols] series = {} @@ -1539,18 +1549,22 @@ def _save_chunk(self, start_i, end_i): slicer = slice(start_i, end_i) for i in range(len(self.blocks)): b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, + d = b.to_native_types(slicer=slicer, + na_rep=self.na_rep, float_format=self.float_format, decimal=self.decimal, - date_format=self.date_format) + date_format=self.date_format, + quoting=self.quoting) for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, + ix = 
data_index.to_native_types(slicer=slicer, + na_rep=self.na_rep, float_format=self.float_format, - date_format=self.date_format) + date_format=self.date_format, + quoting=self.quoting) lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) @@ -1996,7 +2010,7 @@ def _format_strings(self): # this is pretty arbitrary for now has_large_values = (abs_vals > 1e8).any() - has_small_values = ((abs_vals < 10 ** (-self.digits)) & + has_small_values = ((abs_vals < 10 ** (-self.digits+1)) & (abs_vals > 0)).any() if too_long and has_large_values: @@ -2026,16 +2040,43 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): self.date_format = date_format def _format_strings(self): - formatter = (self.formatter or - _get_format_datetime64_from_values(self.values, - nat_rep=self.nat_rep, - date_format=self.date_format)) - fmt_values = [formatter(x) for x in self.values] + # we may have a tz, if so, then need to process element-by-element + # when DatetimeBlockWithTimezones is a reality this could be fixed + values = self.values + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + if values.tz is None: + fmt_values = format_array_from_datetime(values.asi8.ravel(), + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep).reshape(values.shape) + fmt_values = fmt_values.tolist() + + else: + + values = values.asobject + is_dates_only = _is_dates_only(values) + formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format)) + fmt_values = [ formatter(x) for x in self.values ] return fmt_values +def _is_dates_only(values): + # return a boolean if we are only dates (and don't have a timezone) + values = DatetimeIndex(values) + if values.tz is not None: + return False + + values_int = values.asi8 + consider_values = values_int != iNaT + one_day_nanos = (86400 * 1e9) + even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + if even_days: + return True + return False + def _format_datetime64(x, tz=None, nat_rep='NaT'): if x is None or lib.checknull(x): return nat_rep @@ -2058,22 +2099,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): else: return x._date_repr - -def _is_dates_only(values): - # return a boolean if we are only dates (and don't have a timezone) - from pandas import DatetimeIndex - values = DatetimeIndex(values) - if values.tz is not None: - return False - - values_int = values.asi8 - consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 - if even_days: - return True - return False - def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): if is_dates_only: @@ -2084,13 +2109,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) -def _get_format_datetime64_from_values(values, - nat_rep='NaT', - date_format=None): +def _get_format_datetime64_from_values(values, date_format): + """ given values and a date_format, return a string format """ is_dates_only = _is_dates_only(values) - return _get_format_datetime64(is_dates_only=is_dates_only, - nat_rep=nat_rep, - date_format=date_format) + if is_dates_only: + return date_format or "%Y-%m-%d" + return None class Timedelta64Formatter(GenericArrayFormatter): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f700d4316842c..f36108262432d 
100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,8 +26,9 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, is_sequence, _infer_dtype_from_scalar, _values_from_object, - is_list_like, _get_dtype, _maybe_box_datetimelike, - is_categorical_dtype, is_object_dtype, _possibly_infer_to_datetimelike) + is_list_like, _maybe_box_datetimelike, + is_categorical_dtype, is_object_dtype, + _possibly_infer_to_datetimelike) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (maybe_droplevels, @@ -66,7 +67,7 @@ # Docstring templates _shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame', - axes_single_arg="{0,1,'index','columns'}") + axes_single_arg="{0, 1, 'index', 'columns'}") _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. If None, will attempt to use @@ -191,6 +192,11 @@ def _constructor(self): _constructor_sliced = Series + @property + def _constructor_expanddim(self): + from pandas.core.panel import Panel + return Panel + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: @@ -260,8 +266,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mgr = self._init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = self._init_dict({}, index, columns, dtype=dtype) elif isinstance(data, collections.Iterator): raise TypeError("data argument can't be an iterator") else: @@ -657,6 +662,8 @@ def from_dict(cls, data, orient='columns', dtype=None): The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise if the keys should be rows, pass 'index'. + dtype : dtype, default None + Data type to force, otherwise infer Returns ------- @@ -794,10 +801,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls() try: - if compat.PY3: - first_row = next(data) - else: - first_row = next(data) + first_row = next(data) except StopIteration: return cls(index=index, columns=columns) @@ -1064,8 +1068,6 @@ def to_panel(self): ------- panel : Panel """ - from pandas.core.panel import Panel - # only support this kind for now if (not isinstance(self.index, MultiIndex) or # pragma: no cover len(self.index.levels) != 2): @@ -1103,7 +1105,7 @@ def to_panel(self): shape=shape, ref_items=selfsorted.columns) - return Panel(new_mgr) + return self._constructor_expanddim(new_mgr) to_wide = deprecate('to_wide', to_panel) @@ -1244,6 +1246,9 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', >>> writer.save() """ from pandas.io.excel import ExcelWriter + if self.columns.nlevels > 1: + raise NotImplementedError("Writing as Excel with a MultiIndex is " + "not yet implemented.") need_save = False if encoding == None: @@ -1738,17 +1743,19 @@ def _ixs(self, i, axis=0): lab_slice = slice(label[0], label[-1]) return self.ix[:, lab_slice] else: - label = self.columns[i] if isinstance(label, Index): return self.take(i, axis=1, convert=True) + index_len = len(self.index) + # if the values returned are not the same length # as the index (iow a not found value), iget returns # a 0-len ndarray. 
This is effectively catching # a numpy error (as numpy should really raise) values = self._data.iget(i) - if not len(values): - values = np.array([np.nan] * len(self.index), dtype=object) + + if index_len and not len(values): + values = np.array([np.nan] * index_len, dtype=object) result = self._constructor_sliced.from_array( values, index=self.index, name=label, fastpath=True) @@ -1835,7 +1842,7 @@ def _getitem_multilevel(self, key): result.columns = result_columns else: new_values = self.values[:, loc] - result = DataFrame(new_values, index=self.index, + result = self._constructor(new_values, index=self.index, columns=result_columns).__finalize__(self) if len(result.columns) == 1: top = result.columns[0] @@ -1843,7 +1850,7 @@ def _getitem_multilevel(self, key): (type(top) == tuple and top[0] == '')): result = result[''] if isinstance(result, Series): - result = Series(result, index=self.index, name=key) + result = self._constructor_sliced(result, index=self.index, name=key) result._set_is_copy(self) return result @@ -2244,10 +2251,11 @@ def assign(self, **kwargs): Notes ----- Since ``kwargs`` is a dictionary, the order of your - arguments may not be preserved, and so the order of the - new columns is not well defined. Assigning multiple - columns within the same ``assign`` is possible, but you cannot - reference other columns created within the same ``assign`` call. + arguments may not be preserved. The make things predicatable, + the columns are inserted in alphabetical order, at the end of + your DataFrame. Assigning multiple columns within the same + ``assign`` is possible, but you cannot reference other columns + created within the same ``assign`` call. Examples -------- @@ -2296,7 +2304,7 @@ def assign(self, **kwargs): results[k] = v # ... and then assign - for k, v in results.items(): + for k, v in sorted(results.items()): data[k] = v return data @@ -2512,6 +2520,19 @@ def rename(self, index=None, columns=None, **kwargs): return super(DataFrame, self).rename(index=index, columns=columns, **kwargs) + @Appender(_shared_docs['fillna'] % _shared_doc_kwargs) + def fillna(self, value=None, method=None, axis=None, inplace=False, + limit=None, downcast=None, **kwargs): + return super(DataFrame, self).fillna(value=value, method=method, + axis=axis, inplace=inplace, + limit=limit, downcast=downcast, + **kwargs) + + @Appender(_shared_docs['shift'] % _shared_doc_kwargs) + def shift(self, periods=1, freq=None, axis=0, **kwargs): + return super(DataFrame, self).shift(periods=periods, freq=freq, + axis=axis, **kwargs) + def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): """ @@ -2724,7 +2745,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, Parameters ---------- - axis : {0, 1}, or tuple/list thereof + axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof Pass tuple or list to drop on multiple axes how : {'any', 'all'} * any : if any NA values are present, drop that label @@ -2869,7 +2890,7 @@ def sort(self, columns=None, axis=0, ascending=True, ascending : boolean or list, default True Sort ascending vs. descending. 
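For illustration only (not part of the patch), a minimal sketch of what the reworded ``assign`` note means in practice, assuming a small toy frame:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]})

    # Both new columns come from keyword arguments; per the note above they
    # are appended in alphabetical order at the end of the frame, so at the
    # time of this change the resulting columns are ['a', 'b', 'z'] no matter
    # which order the keywords were written in.
    result = df.assign(z=df['a'] * 2, b=df['a'] - 1)
    print(list(result.columns))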
Specify list for multiple sort orders - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 Sort index/rows versus columns inplace : boolean, default False Sort the DataFrame without creating a new instance @@ -2898,7 +2919,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, Parameters ---------- - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 Sort index/rows versus columns by : object Column name(s) in frame. Accepts a column name or a list @@ -3006,7 +3027,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, Parameters ---------- level : int - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 ascending : boolean, default True inplace : boolean, default False Sort the DataFrame without creating a new instance @@ -3583,7 +3604,7 @@ def unstack(self, level=-1): #---------------------------------------------------------------------- # Time series-related - def diff(self, periods=1): + def diff(self, periods=1, axis=0): """ 1st discrete difference of object @@ -3591,12 +3612,14 @@ def diff(self, periods=1): ---------- periods : int, default 1 Periods to shift for forming difference + axis : {0 or 'index', 1 or 'columns'}, default 0 Returns ------- diffed : DataFrame """ - new_data = self._data.diff(n=periods) + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.diff(n=periods, axis=bm_axis) return self._constructor(new_data) #---------------------------------------------------------------------- @@ -3616,9 +3639,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, ---------- func : function Function to apply to each column/row - axis : {0, 1} - * 0 : apply function to each column - * 1 : apply function to each row + axis : {0 or 'index', 1 or 'columns'}, default 0 + * 0 or 'index': apply function to each column + * 1 or 'columns': apply function to each row broadcast : boolean, default False For aggregation functions, return object of same size with values propagated @@ -4139,8 +4162,8 @@ def corrwith(self, other, axis=0, drop=False): Parameters ---------- other : DataFrame - axis : {0, 1} - 0 to compute column-wise, 1 for row-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise drop : boolean, default False Drop missing indices from result, default returns union of all @@ -4191,8 +4214,8 @@ def count(self, axis=0, level=None, numeric_only=False): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame @@ -4345,8 +4368,8 @@ def idxmin(self, axis=0, skipna=True): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA @@ -4376,8 +4399,8 @@ def idxmax(self, axis=0, skipna=True): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be first index. 
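A small usage sketch (toy data, not from the patch) of the new ``axis`` keyword on ``DataFrame.diff`` and of the ``0 or 'index' / 1 or 'columns'`` spelling now used throughout these docstrings:

    import pandas as pd

    df = pd.DataFrame({'x': [1, 3, 6], 'y': [2, 2, 5]})

    # Unchanged default: first discrete difference down each column.
    col_diff = df.diff()

    # New in this patch: axis=1 (or 'columns') differences across columns.
    row_diff = df.diff(axis=1)

    # The string spellings documented above work wherever axis is accepted.
    n_valid = df.count(axis='columns')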
@@ -4413,19 +4436,19 @@ def mode(self, axis=0, numeric_only=False): """ Gets the mode(s) of each element along the axis selected. Empty if nothing has 2+ occurrences. Adds a row for each mode per label, fills in gaps - with nan. - + with nan. + Note that there could be multiple values returned for the selected - axis (when more than one item share the maximum frequency), which is the - reason why a dataframe is returned. If you want to impute missing values - with the mode in a dataframe ``df``, you can just do this: + axis (when more than one item share the maximum frequency), which is the + reason why a dataframe is returned. If you want to impute missing values + with the mode in a dataframe ``df``, you can just do this: ``df.fillna(df.mode().iloc[0])`` Parameters ---------- - axis : {0, 1, 'index', 'columns'} (default 0) - * 0/'index' : get mode of each column - * 1/'columns' : get mode of each row + axis : {0 or 'index', 1 or 'columns'}, default 0 + * 0 or 'index' : get mode of each column + * 1 or 'columns' : get mode of each row numeric_only : boolean, default False if True, only apply to numeric columns @@ -4530,7 +4553,7 @@ def rank(self, axis=0, numeric_only=None, method='average', Parameters ---------- - axis : {0, 1}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 Ranks over columns (0) or rows (1) numeric_only : boolean, default None Include only float, int, boolean data @@ -4582,7 +4605,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): how : {'s', 'e', 'start', 'end'} Convention for converting period to timestamp; start of period vs. end - axis : {0, 1} default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default) copy : boolean, default True If false then underlying input data is not copied @@ -4613,7 +4636,7 @@ def to_period(self, freq=None, axis=0, copy=True): Parameters ---------- freq : string, default - axis : {0, 1}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default) copy : boolean, default True If False then underlying input data is not copied diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 012a73fac1ef4..d6c7d87bb25b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -17,11 +17,12 @@ import pandas.core.common as com import pandas.core.datetools as datetools from pandas import compat -from pandas.compat import map, zip, lrange, string_types, isidentifier, lmap +from pandas.compat import map, zip, lrange, string_types, isidentifier from pandas.core.common import (isnull, notnull, is_list_like, _values_from_object, _maybe_promote, _maybe_box_datetimelike, ABCSeries, - SettingWithCopyError, SettingWithCopyWarning) + SettingWithCopyError, SettingWithCopyWarning, + AbstractMethodError) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config @@ -137,7 +138,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): @property def _constructor(self): - raise NotImplementedError + raise AbstractMethodError(self) def __unicode__(self): # unicode representation based upon iterating over self @@ -145,13 +146,17 @@ def __unicode__(self): prepr = '[%s]' % ','.join(map(com.pprint_thing, self)) return '%s(%s)' % (self.__class__.__name__, prepr) - def _local_dir(self): + def _dir_additions(self): """ add the string-like attributes from the info_axis """ - return [c for c in self._info_axis - if isinstance(c, string_types) and 
isidentifier(c)] + return set([c for c in self._info_axis + if isinstance(c, string_types) and isidentifier(c)]) @property def _constructor_sliced(self): + raise AbstractMethodError(self) + + @property + def _constructor_expanddim(self): raise NotImplementedError #---------------------------------------------------------------------- @@ -1100,7 +1105,7 @@ def _iget_item_cache(self, item): return lower def _box_item_values(self, key, values): - raise NotImplementedError + raise AbstractMethodError(self) def _maybe_cache_changed(self, item, value): """ @@ -1557,7 +1562,7 @@ def reindex_like(self, other, method=None, copy=True, limit=None): return self.reindex(**d) - def drop(self, labels, axis=0, level=None, inplace=False): + def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): """ Return new object with labels in requested axis removed @@ -1569,6 +1574,8 @@ def drop(self, labels, axis=0, level=None, inplace=False): For MultiIndex inplace : bool, default False If True, do operation inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. Returns ------- @@ -1582,9 +1589,9 @@ def drop(self, labels, axis=0, level=None, inplace=False): if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') - new_axis = axis.drop(labels, level=level) + new_axis = axis.drop(labels, level=level, errors=errors) else: - new_axis = axis.drop(labels) + new_axis = axis.drop(labels, errors=errors) dropped = self.reindex(**{axis_name: new_axis}) try: dropped.axes[axis_].set_names(axis.names, inplace=True) @@ -1941,6 +1948,103 @@ def tail(self, n=5): return self return self.iloc[-n:] + + def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None): + """ + Returns a random sample of items from an axis of object. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. + replace : boolean, optional + Sample with or without replacement. Default = False. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Weights must be same length as axis being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. + inf and -inf values not allowed. + random_state : int or numpy.random.RandomState, optional + Seed for the random number generator (if int), or numpy RandomState + object. + axis : int or string, optional + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames, 1 for Panels). + + Returns + ------- + Same type as caller. 
+ """ + + if axis is None: + axis = self._stat_axis_number + + axis = self._get_axis_number(axis) + axis_length = self.shape[axis] + + # Process random_state argument + rs = com._random_state(random_state) + + # Check weights for compliance + if weights is not None: + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, string_types): + if isinstance(self, pd.DataFrame): + if axis == 0: + try: + weights = self[weights] + except KeyError: + raise KeyError("String passed to weights not a valid column") + else: + raise ValueError("Strings can only be passed to weights when sampling from rows on a DataFrame") + else: + raise ValueError("Strings cannot be passed as weights when sampling from a Series or Panel.") + + weights = pd.Series(weights, dtype='float64') + + if len(weights) != axis_length: + raise ValueError("Weights and axis to be sampled must be of same length") + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + weights = weights / weights.sum() + + weights = weights.values + + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif n is not None and frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + n = int(round(frac * axis_length)) + elif n is not None and frac is not None: + raise ValueError('Please enter a value for `frac` OR `n`, not both') + + # Check for negative sizes + if n < 0: + raise ValueError("A negative number of rows requested. Please provide positive value.") + + locs = rs.choice(axis_length, size=n, replace=replace, p=weights) + return self.take(locs, axis=axis) + + #---------------------------------------------------------------------- # Attribute access @@ -2261,19 +2365,23 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, Parameters ---------- - convert_dates : if True, attempt to soft convert dates, if 'coerce', - force conversion (and non-convertibles get NaT) - convert_numeric : if True attempt to coerce to numbers (including - strings), non-convertibles get NaN - convert_timedeltas : if True, attempt to soft convert timedeltas, if 'coerce', - force conversion (and non-convertibles get NaT) - copy : Boolean, if True, return copy even if no copy is necessary - (e.g. no conversion was done), default is True. - It is meant for internal use, not to be confused with `inplace` kw. + convert_dates : boolean, default True + If True, convert to date where possible. If 'coerce', force + conversion, with unconvertible values becoming NaT. + convert_numeric : boolean, default False + If True, attempt to coerce to numbers (including strings), with + unconvertible values becoming NaN. + convert_timedeltas : boolean, default True + If True, convert to timedelta where possible. If 'coerce', force + conversion, with unconvertible values becoming NaT. + copy : boolean, default True + If True, return a copy even if no copy is necessary (e.g. no + conversion was done). Note: This is meant for internal use, and + should not be confused with inplace. 
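A hedged usage sketch of the ``sample`` method added above, assuming a small toy frame; the weights column ``'w'`` is invented for illustration:

    import pandas as pd

    df = pd.DataFrame({'value': range(5), 'w': [0.1, 0.1, 0.2, 0.3, 0.3]})

    # Three rows without replacement; the integer seed is processed by the
    # random_state helper, making the draw reproducible.
    s1 = df.sample(n=3, random_state=42)

    # A fraction of rows, weighted by a column name (only valid for axis=0
    # on a DataFrame); weights are renormalized and missing weights count
    # as zero, per the docstring above.
    s2 = df.sample(frac=0.5, weights='w', random_state=42)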
Returns ------- - converted : asm as input object + converted : same as input object """ return self._constructor( self._data.convert(convert_dates=convert_dates, @@ -2284,31 +2392,33 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, #---------------------------------------------------------------------- # Filling NA's - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None): + _shared_docs['fillna'] = ( """ Fill NA/NaN values using the specified method Parameters ---------- - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap value : scalar, dict, Series, or DataFrame Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or column (for a DataFrame). (values not in the dict/Series/DataFrame will not be filled). This value cannot be a list. - axis : {0, 1}, default 0 - * 0: fill column-by-column - * 1: fill row-by-row + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + axis : %(axes_single_arg)s inplace : boolean, default False If True, fill in place. Note: this will modify any other views on this object, (e.g. a no-copy slice for a column in a DataFrame). limit : int, default None - Maximum size gap to forward or backward fill + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. downcast : dict, default is None a dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate @@ -2320,8 +2430,13 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, Returns ------- - filled : same type as caller + filled : %(klass)s """ + ) + + @Appender(_shared_docs['fillna'] % _shared_doc_kwargs) + def fillna(self, value=None, method=None, axis=None, inplace=False, + limit=None, downcast=None): if isinstance(value, (list, tuple)): raise TypeError('"value" parameter must be a scalar or dict, but ' 'you passed a "{0}"'.format(type(value).__name__)) @@ -2724,7 +2839,8 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, Examples -------- - # Filling in NaNs: + Filling in NaNs + >>> s = pd.Series([0, 1, np.nan, 3]) >>> s.interpolate() 0 0 @@ -2810,37 +2926,77 @@ def notnull(self): """ return notnull(self).__finalize__(self) - def clip(self, lower=None, upper=None, out=None): + def clip(self, lower=None, upper=None, out=None, axis=None): """ Trim values at input threshold(s) Parameters ---------- - lower : float, default None - upper : float, default None + lower : float or array_like, default None + upper : float or array_like, default None + axis : int or string axis name, optional + Align object with lower and upper along the given axis. 
Returns ------- clipped : Series + + Examples + -------- + >>> df + 0 1 + 0 0.335232 -1.256177 + 1 -1.367855 0.746646 + 2 0.027753 -1.176076 + 3 0.230930 -0.679613 + 4 1.261967 0.570967 + >>> df.clip(-1.0, 0.5) + 0 1 + 0 0.335232 -1.000000 + 1 -1.000000 0.500000 + 2 0.027753 -1.000000 + 3 0.230930 -0.679613 + 4 0.500000 0.500000 + >>> t + 0 -0.3 + 1 -0.2 + 2 -0.1 + 3 0.0 + 4 0.1 + dtype: float64 + >>> df.clip(t, t + 1, axis=0) + 0 1 + 0 0.335232 -0.300000 + 1 -0.200000 0.746646 + 2 0.027753 -0.100000 + 3 0.230930 0.000000 + 4 1.100000 0.570967 """ if out is not None: # pragma: no cover raise Exception('out argument is not supported yet') # GH 2747 (arguments were reversed) if lower is not None and upper is not None: - lower, upper = min(lower, upper), max(lower, upper) + if lib.isscalar(lower) and lib.isscalar(upper): + lower, upper = min(lower, upper), max(lower, upper) result = self if lower is not None: - result = result.clip_lower(lower) + result = result.clip_lower(lower, axis) if upper is not None: - result = result.clip_upper(upper) + result = result.clip_upper(upper, axis) return result - def clip_upper(self, threshold): + def clip_upper(self, threshold, axis=None): """ - Return copy of input with values above given value truncated + Return copy of input with values above given value(s) truncated + + Parameters + ---------- + threshold : float or array_like + axis : int or string axis name, optional + Align object with threshold along the given axis. See also -------- @@ -2850,14 +3006,21 @@ def clip_upper(self, threshold): ------- clipped : same type as input """ - if isnull(threshold): + if np.any(isnull(threshold)): raise ValueError("Cannot use an NA value as a clip threshold") - return self.where((self <= threshold) | isnull(self), threshold) + subset = self.le(threshold, axis=axis) | isnull(self) + return self.where(subset, threshold, axis=axis) - def clip_lower(self, threshold): + def clip_lower(self, threshold, axis=None): """ - Return copy of the input with values below given value truncated + Return copy of the input with values below given value(s) truncated + + Parameters + ---------- + threshold : float or array_like + axis : int or string axis name, optional + Align object with threshold along the given axis. 
See also -------- @@ -2867,10 +3030,11 @@ def clip_lower(self, threshold): ------- clipped : same type as input """ - if isnull(threshold): + if np.any(isnull(threshold)): raise ValueError("Cannot use an NA value as a clip threshold") - return self.where((self >= threshold) | isnull(self), threshold) + subset = self.ge(threshold, axis=axis) | isnull(self) + return self.where(subset, threshold, axis=axis) def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False): @@ -2903,13 +3067,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Examples -------- - # DataFrame result - >>> data.groupby(func, axis=0).mean() + DataFrame results - # DataFrame result + >>> data.groupby(func, axis=0).mean() >>> data.groupby(['col1', 'col2'])['col3'].mean() - # DataFrame with hierarchical index + DataFrame with hierarchical index + >>> data.groupby(['col1', 'col2']).mean() Returns @@ -3051,7 +3215,8 @@ def first(self, offset): """ from pandas.tseries.frequencies import to_offset if not isinstance(self.index, DatetimeIndex): - raise NotImplementedError + raise NotImplementedError("'first' only supports a DatetimeIndex " + "index") if len(self.index) == 0: return self @@ -3085,7 +3250,8 @@ def last(self, offset): """ from pandas.tseries.frequencies import to_offset if not isinstance(self.index, DatetimeIndex): - raise NotImplementedError + raise NotImplementedError("'last' only supports a DatetimeIndex " + "index") if len(self.index) == 0: return self @@ -3199,11 +3365,10 @@ def _align_series(self, other, join='outer', axis=None, level=None, level=level, return_indexers=True) - left_result = self._reindex_indexer(join_index, lidx, copy) - right_result = other._reindex_indexer(join_index, ridx, copy) + left = self._reindex_indexer(join_index, lidx, copy) + right = other._reindex_indexer(join_index, ridx, copy) else: - # one has > 1 ndim fdata = self._data if axis == 0: @@ -3233,23 +3398,19 @@ def _align_series(self, other, join='outer', axis=None, level=None, if copy and fdata is self._data: fdata = fdata.copy() - left_result = DataFrame(fdata) + left = DataFrame(fdata) if ridx is None: - right_result = other + right = other else: - right_result = other.reindex(join_index, level=level) + right = other.reindex(join_index, level=level) # fill fill_na = notnull(fill_value) or (method is not None) if fill_na: - return (left_result.fillna(fill_value, method=method, limit=limit, - axis=fill_axis), - right_result.fillna(fill_value, method=method, - limit=limit)) - else: - return (left_result.__finalize__(self), - right_result.__finalize__(other)) + left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis) + right = right.fillna(fill_value, method=method, limit=limit) + return (left.__finalize__(self), right.__finalize__(other)) _shared_docs['where'] = (""" Return an object of same shape as self and whose corresponding @@ -3326,11 +3487,20 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, except ValueError: new_other = np.array(other) - if not (new_other == np.array(other)).all(): - other = np.array(other) + matches = (new_other == np.array(other)) + if matches is False or not matches.all(): - # we can't use our existing dtype - # because of incompatibilities + # coerce other to a common dtype if we can + if com.needs_i8_conversion(self.dtype): + try: + other = np.array(other, dtype=self.dtype) + except: + other = np.array(other) + else: + other = np.asarray(other) + other = np.asarray(other, 
dtype=np.common_type(other, new_other)) + + # we need to use the new dtype try_quick = False else: other = new_other @@ -3409,8 +3579,7 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, return self.where(~cond, other=other, inplace=inplace, axis=axis, level=level, try_cast=try_cast, raise_on_error=raise_on_error) - def shift(self, periods=1, freq=None, axis=0, **kwargs): - """ + _shared_docs['shift'] = (""" Shift index by desired number of periods with an optional time freq Parameters @@ -3420,6 +3589,7 @@ def shift(self, periods=1, freq=None, axis=0, **kwargs): freq : DateOffset, timedelta, or time rule string, optional Increment to use from datetools module or time rule (e.g. 'EOM'). See Notes. + axis : %(axes_single_arg)s Notes ----- @@ -3429,8 +3599,10 @@ def shift(self, periods=1, freq=None, axis=0, **kwargs): Returns ------- - shifted : same type as caller - """ + shifted : %(klass)s + """) + @Appender(_shared_docs['shift'] % _shared_doc_kwargs) + def shift(self, periods=1, freq=None, axis=0, **kwargs): if periods == 0: return self @@ -3589,8 +3761,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): def tz_convert(self, tz, axis=0, level=None, copy=True): """ - Convert the axis to target time zone. If it is time zone naive, it - will be localized to the passed time zone. + Convert tz-aware axis to target time zone. Parameters ---------- @@ -3604,6 +3775,11 @@ def tz_convert(self, tz, axis=0, level=None, copy=True): Returns ------- + + Raises + ------ + TypeError + If the axis is tz-naive. """ axis = self._get_axis_number(axis) ax = self._get_axis(axis) @@ -3662,6 +3838,11 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, Returns ------- + + Raises + ------ + TypeError + If the TimeSeries is tz-aware and tz is not None. 
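The clarified ``tz_convert``/``tz_localize`` docstrings amount to the following sketch (toy data, assuming the usual localize-then-convert workflow):

    import pandas as pd

    idx = pd.date_range('2015-01-01', periods=3)   # tz-naive index
    df = pd.DataFrame({'x': [1, 2, 3]}, index=idx)

    # Converting a tz-naive axis is documented to raise TypeError; localize
    # it first, then convert between zones.
    localized = df.tz_localize('UTC')
    converted = localized.tz_convert('US/Eastern')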
""" axis = self._get_axis_number(axis) ax = self._get_axis(axis) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6d98b3b99021b..51674bad60f5b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -14,7 +14,7 @@ from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes +from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -25,7 +25,8 @@ notnull, _DATELIKE_DTYPES, is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype, _values_from_object, - is_datetime_or_timedelta_dtype, is_bool_dtype) + is_datetime_or_timedelta_dtype, is_bool, + is_bool_dtype, AbstractMethodError) from pandas.core.config import option_context import pandas.lib as lib from pandas.lib import Timestamp @@ -186,7 +187,7 @@ class Grouper(object): Examples -------- - >>> df.groupby(Grouper(key='A')) : syntatic sugar for df.groupby('A') + >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A') >>> df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date' >>> df.groupby(Grouper(level='date',freq='60s',axis=1)) : specify a resample on the level 'date' on the columns axis with a frequency of 60s @@ -279,7 +280,10 @@ def _set_grouper(self, obj, sort=False): return self.grouper def _get_binner_for_grouping(self, obj): - raise NotImplementedError + """ default to the standard binner here """ + group_axis = obj._get_axis(self.axis) + return Grouping(group_axis, None, obj=obj, name=self.key, + level=self.level, sort=self.sort, in_axis=False) @property def groups(self): @@ -422,7 +426,11 @@ def convert(key, s): return Timestamp(key).asm8 return key - sample = next(iter(self.indices)) + if len(self.indices) > 0: + sample = next(iter(self.indices)) + else: + sample = None # Dummy sample + if isinstance(sample, tuple): if not isinstance(name, tuple): msg = ("must supply a tuple to get_group with multiple" @@ -490,15 +498,15 @@ def _set_result_index_ordered(self, result): # shortcut of we have an already ordered grouper if not self.grouper.is_monotonic: - index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ])) + index = Index(np.concatenate([ indices.get(v, []) for v in self.grouper.result_index])) result.index = index result = result.sort_index() result.index = self.obj.index return result - def _local_dir(self): - return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) + def _dir_additions(self): + return self.obj._dir_additions() | self._apply_whitelist def __getattr__(self, attr): if attr in self._internal_names_set: @@ -670,7 +678,7 @@ def _python_apply_general(self, f): not_indexed_same=mutated) def aggregate(self, func, *args, **kwargs): - raise NotImplementedError + raise AbstractMethodError(self) @Appender(_agg_doc) def agg(self, func, *args, **kwargs): @@ -680,7 +688,7 @@ def _iterate_slices(self): yield self.name, self._selected_obj def transform(self, func, *args, **kwargs): - raise NotImplementedError + raise AbstractMethodError(self) def mean(self): """ @@ -1127,7 +1135,7 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(output) def _wrap_applied_output(self, *args, **kwargs): - raise NotImplementedError + raise AbstractMethodError(self) 
def _concat_objects(self, keys, values, not_indexed_same=False): from pandas.tools.merge import concat @@ -1484,13 +1492,16 @@ def aggregate(self, values, how, axis=0): swapped = True values = values.swapaxes(0, axis) if arity > 1: - raise NotImplementedError + raise NotImplementedError("arity of more than 1 is not " + "supported for the 'how' argument") out_shape = (self.ngroups,) + values.shape[1:] is_numeric = is_numeric_dtype(values.dtype) if is_datetime_or_timedelta_dtype(values.dtype): values = values.view('int64') + # GH 7754 + is_numeric = True elif is_bool_dtype(values.dtype): values = _algos.ensure_float64(values) elif com.is_integer_dtype(values): @@ -1556,7 +1567,8 @@ def _aggregate(self, result, counts, values, agg_func, is_numeric): comp_ids, _, ngroups = self.group_info if values.ndim > 3: # punting for now - raise NotImplementedError + raise NotImplementedError("number of dimensions is currently " + "limited to 3") elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): @@ -1777,12 +1789,14 @@ def size(self): Compute group sizes """ - base = Series(np.zeros(len(self.result_index), dtype=np.int64), - index=self.result_index) + index = self.result_index + base = Series(np.zeros(len(index), dtype=np.int64), index=index) indices = self.indices for k, v in compat.iteritems(indices): indices[k] = len(v) bin_counts = Series(indices, dtype=np.int64) + # make bin_counts.index to have same name to preserve it + bin_counts.index.name = index.name result = base.add(bin_counts, fill_value=0) # addition with fill_value changes dtype to float64 result = result.astype(np.int64) @@ -1815,7 +1829,8 @@ def _aggregate(self, result, counts, values, agg_func, is_numeric=True): if values.ndim > 3: # punting for now - raise NotImplementedError + raise NotImplementedError("number of dimensions is currently " + "limited to 3") elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): agg_func(result[:, :, i], counts, chunk, self.bins) @@ -1922,7 +1937,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = com._asarray_tuplesafe(self.grouper) # a passed Categorical - elif isinstance(self.grouper, Categorical): + elif is_categorical_dtype(self.grouper): # must have an ordered categorical if self.sort: @@ -1936,8 +1951,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # fix bug #GH8868 sort=False being ignored in categorical groupby else: self.grouper = self.grouper.reorder_categories(self.grouper.unique()) + + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes self._labels = self.grouper.codes - self._group_index = self.grouper.categories + + c = self.grouper.categories + self._group_index = CategoricalIndex(Categorical.from_codes(np.arange(len(c)), + categories=c, + ordered=self.grouper.ordered)) if self.name is None: self.name = self.grouper.name @@ -1951,8 +1973,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if self.name is None: self.name = grouper.name + # we are done + if isinstance(self.grouper, Grouping): + self.grouper = self.grouper.grouper + # no level passed - if not isinstance(self.grouper, (Series, Index, Categorical, np.ndarray)): + elif not isinstance(self.grouper, (Series, Index, Categorical, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError("Grouper for '%s' not 1-dimensional" % t) @@ -2125,8 +2151,8 @@ def is_in_obj(gpr): else: 
in_axis, name = False, None - if isinstance(gpr, Categorical) and len(gpr) != len(obj): - raise ValueError("Categorical grouper must have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != len(obj): + raise ValueError("Categorical dtype grouper must have len(grouper) == len(data)") ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort, in_axis=in_axis) @@ -2423,6 +2449,8 @@ def transform(self, func, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) for i, (name, group) in enumerate(self): + if name not in self.indices: + continue object.__setattr__(group, 'name', name) res = wrapper(group) @@ -2438,7 +2466,7 @@ def transform(self, func, *args, **kwargs): except: pass - indexer = self._get_index(name) + indexer = self.indices[name] result[indexer] = res result = _possibly_downcast_to_dtype(result, dtype) @@ -2452,9 +2480,12 @@ def _transform_fast(self, func): """ if isinstance(func, compat.string_types): func = getattr(self,func) + values = func().values - counts = self.size().values + counts = self.size().fillna(0).values values = np.repeat(values, com._ensure_platform_int(counts)) + if any(counts == 0): + values = self._try_cast(values, self._selected_obj) return self._set_result_index_ordered(Series(values)) @@ -2489,8 +2520,11 @@ def true_and_notnull(x, *args, **kwargs): return b and notnull(b) try: - indices = [self._get_index(name) if true_and_notnull(group) else [] - for name, group in self] + indices = [] + for name, group in self: + if true_and_notnull(group) and name in self.indices: + indices.append(self.indices[name]) + except ValueError: raise TypeError("the filter must return a boolean result") except TypeError: @@ -2622,7 +2656,8 @@ def aggregate(self, arg, *args, **kwargs): if self._selection is not None: subset = obj if isinstance(subset, DataFrame): - raise NotImplementedError + raise NotImplementedError("Aggregating on a DataFrame is " + "not supported") for fname, agg_how in compat.iteritems(arg): colg = SeriesGroupBy(subset, selection=self._selection, @@ -2671,7 +2706,7 @@ def _aggregate_multiple_funcs(self, arg): from pandas.tools.merge import concat if self.axis != 0: - raise NotImplementedError + raise NotImplementedError("axis other than 0 is not supported") obj = self._obj_with_exclusions @@ -2721,7 +2756,7 @@ def _aggregate_generic(self, func, *args, **kwargs): return self._wrap_generic_output(result, obj) def _wrap_aggregated_output(self, output, names=None): - raise NotImplementedError + raise AbstractMethodError(self) def _aggregate_item_by_item(self, func, *args, **kwargs): # only for axis==0 @@ -2808,7 +2843,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # make Nones an empty object if com._count_not_none(*values) != len(values): - v = next(v for v in values if v is not None) + try: + v = next(v for v in values if v is not None) + except StopIteration: + # If all values are None, then this will throw an error. + # We'd prefer it return an empty dataframe. 
+ return DataFrame() if v is None: return DataFrame() elif isinstance(v, NDFrame): @@ -3001,24 +3041,18 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - # a grouped that doesn't preserve the index, remap index based on the grouper - # and broadcast it - if ((not isinstance(obj.index,MultiIndex) and - type(result.index) != type(obj.index)) or - len(result.index) != len(obj.index)): - results = obj.values.copy() - indices = self.indices - for (name, group), (i, row) in zip(self, result.iterrows()): + results = np.empty_like(obj.values, result.values.dtype) + indices = self.indices + for (name, group), (i, row) in zip(self, result.iterrows()): + if name in indices: indexer = indices[name] results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1) - return DataFrame(results,columns=result.columns,index=obj.index).convert_objects() - # we can merge the result in - # GH 7383 - names = result.columns - result = obj.merge(result, how='outer', left_index=True, right_index=True).iloc[:,-result.shape[1]:] - result.columns = names - return result + counts = self.size().fillna(0).values + if any(counts == 0): + results = self._try_cast(results, obj[result.columns]) + + return DataFrame(results,columns=result.columns,index=obj.index).convert_objects() def _define_paths(self, func, *args, **kwargs): if isinstance(func, compat.string_types): @@ -3110,10 +3144,9 @@ def filter(self, func, dropna=True, *args, **kwargs): pass # interpret the result of the filter - if (isinstance(res, (bool, np.bool_)) or - np.isscalar(res) and isnull(res)): - if res and notnull(res): - indices.append(self._get_index(name)) + if is_bool(res) or (lib.isscalar(res) and isnull(res)): + if res and notnull(res) and name in self.indices: + indices.append(self.indices[name]) else: # non scalars aren't allowed raise TypeError("filter function returned a %s, " @@ -3245,7 +3278,7 @@ def _reindex_output(self, result): return result elif len(groupings) == 1: return result - elif not any([isinstance(ping.grouper, Categorical) + elif not any([isinstance(ping.grouper, (Categorical, CategoricalIndex)) for ping in groupings]): return result @@ -3283,7 +3316,7 @@ def _iterate_slices(self): slice_axis = self._selection_list slicer = lambda x: self._selected_obj[x] else: - raise NotImplementedError + raise NotImplementedError("axis other than 0 is not supported") for val in slice_axis: if val in self.exclusions: @@ -3348,10 +3381,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): new_axes[self.axis] = self.grouper.result_index return Panel._from_axes(result, new_axes) else: - raise NotImplementedError + raise ValueError("axis value must be greater than 0") def _wrap_aggregated_output(self, output, names=None): - raise NotImplementedError + raise AbstractMethodError(self) class NDArrayGroupBy(GroupBy): @@ -3405,7 +3438,7 @@ def _chop(self, sdata, slice_obj): return sdata.iloc[slice_obj] def apply(self, f): - raise NotImplementedError + raise AbstractMethodError(self) class ArraySplitter(DataSplitter): diff --git a/pandas/core/index.py b/pandas/core/index.py index e335d00551bab..2bd96fcec2e42 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2,6 +2,7 @@ import datetime import warnings import operator + from functools import partial from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat @@ -13,20 +14,22 @@ import pandas.algos as _algos import pandas.index as 
_index from pandas.lib import Timestamp, Timedelta, is_datetime_array -from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs +from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate) -from pandas.core.common import isnull, array_equivalent import pandas.core.common as com -from pandas.core.common import (_values_from_object, is_float, is_integer, - ABCSeries, _ensure_object, _ensure_int64, is_bool_indexer, +from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, + _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, + ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning # simplify -default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'), - quote_strings=True) +default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x, + escape_chars=('\t', '\r', '\n'), + quote_strings=True, + max_seq_items=max_seq_items) __all__ = ['Index'] @@ -44,27 +47,6 @@ def _try_get_item(x): except AttributeError: return x -def _indexOp(opname): - """ - Wrapper function for index comparison operations, to avoid - code duplication. - """ - - def wrapper(self, other): - func = getattr(self._data.view(np.ndarray), opname) - result = func(np.asarray(other)) - - # technically we could support bool dtyped Index - # for now just return the indexing array directly - if is_bool_dtype(result): - return result - try: - return Index(result) - except: # pragma: no cover - return result - return wrapper - - class InvalidIndexError(Exception): pass @@ -163,6 +145,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + return CategoricalIndex(data, copy=copy, name=name, **kwargs) else: subarr = com._asarray_tuplesafe(data, dtype=object) @@ -171,6 +155,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if copy: subarr = subarr.copy() + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + return CategoricalIndex(data, copy=copy, name=name, **kwargs) elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -259,7 +245,7 @@ def __len__(self): """ return len(self._data) - def __array__(self, result=None): + def __array__(self, dtype=None): """ the array interface, return my values """ return self._data.view(np.ndarray) @@ -283,9 +269,6 @@ def get_values(self): """ return the underlying data as an ndarray """ return self.values - def _array_values(self): - return self._data - # ops compat def tolist(self): """ @@ -411,9 +394,150 @@ def __unicode__(self): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. 
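A short sketch (not part of the patch) of the constructor dispatch added above: categorical input is expected to produce a ``CategoricalIndex`` rather than a plain object ``Index``:

    import pandas as pd

    cats = pd.Categorical(['a', 'b', 'a', 'c'], categories=['a', 'b', 'c'])

    # Categorical data (or dtype='category') is routed to CategoricalIndex.
    idx = pd.Index(cats)
    print(type(idx).__name__)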
""" - prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), - quote_strings=True) - return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + klass = self.__class__.__name__ + data = self._format_data() + attrs = self._format_attrs() + space = self._format_space() + + prepr = (u(",%s") % space).join([u("%s=%s") % (k, v) + for k, v in attrs]) + + # no data provided, just attributes + if data is None: + data = '' + + res = u("%s(%s%s)") % (klass, + data, + prepr) + + return res + + def _format_space(self): + + # using space here controls if the attributes + # are line separated or not (the default) + + #max_seq_items = get_option('display.max_seq_items') + #if len(self) > max_seq_items: + # space = "\n%s" % (' ' * (len(klass) + 1)) + return " " + + @property + def _formatter_func(self): + """ + Return the formatted data as a unicode string + """ + return default_pprint + + def _format_data(self): + """ + Return the formatted data as a unicode string + """ + from pandas.core.format import get_console_size + display_width, _ = get_console_size() + if display_width is None: + display_width = get_option('display.width') or 80 + + space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) + space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + + n = len(self) + sep = ',' + max_seq_items = get_option('display.max_seq_items') + formatter = self._formatter_func + + # do we want to justify (only do so for non-objects) + is_justify = not (self.inferred_type == 'string' or self.inferred_type == 'categorical' and is_object_dtype(self.categories)) + + # are we a truncated display + is_truncated = n > max_seq_items + + def _extend_line(s, line, value, display_width, next_line_prefix): + + if len(line.rstrip()) + len(value.rstrip()) >= display_width: + s += line.rstrip() + line = next_line_prefix + line += value + return s, line + + def best_len(values): + if values: + return max([len(x) for x in values]) + else: + return 0 + + if n == 0: + summary = '[], ' + elif n == 1: + first = formatter(self[0]) + summary = '[%s], ' % first + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = '[%s, %s], ' % (first, last) + else: + + if n > max_seq_items: + n = min(max_seq_items//2,10) + head = [ formatter(x) for x in self[:n] ] + tail = [ formatter(x) for x in self[-n:] ] + else: + head = [] + tail = [ formatter(x) for x in self ] + + # adjust all values to max length if needed + if is_justify: + + # however, if we are not truncated and we are only a single line, then don't justify + if is_truncated or not (len(', '.join(head)) < display_width and len(', '.join(tail)) < display_width): + max_len = max(best_len(head), best_len(tail)) + head = [x.rjust(max_len) for x in head] + tail = [x.rjust(max_len) for x in tail] + + summary = "" + line = space2 + + for i in range(len(head)): + word = head[i] + sep + ' ' + summary, line = _extend_line(summary, line, word, + display_width, space2) + if is_truncated: + summary += line + space2 + '...' 
+ line = space2 + + for i in range(len(tail)-1): + word = tail[i] + sep + ' ' + summary, line = _extend_line(summary, line, word, + display_width, space2) + + # last value: no sep added + 1 space of width used for trailing ',' + summary, line = _extend_line(summary, line, tail[-1], + display_width - 2, space2) + summary += line + summary += '],' + + if len(summary) > (display_width): + summary += space1 + else: # one row + summary += ' ' + + # remove initial space + summary = '[' + summary[len(space2):] + + return summary + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value) + """ + attrs = [] + attrs.append(('dtype',"'%s'" % self.dtype)) + if self.name is not None: + attrs.append(('name',default_pprint(self.name))) + max_seq_items = get_option('display.max_seq_items') + if len(self) > max_seq_items: + attrs.append(('length',len(self))) + return attrs def to_series(self, **kwargs): """ @@ -430,9 +554,10 @@ def to_series(self, **kwargs): def _to_embed(self, keep_tz=False): """ + *this is an internal non-public method* + return an array repr of this object, potentially casting to object - This is for internal compat """ return self.values @@ -455,8 +580,18 @@ def to_datetime(self, dayfirst=False): return DatetimeIndex(self.values) def _assert_can_do_setop(self, other): + if not com.is_list_like(other): + raise TypeError('Input must be Index or array-like') return True + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = self.name if self.name == other.name else None + return other, result_name + @property def nlevels(self): return 1 @@ -624,7 +759,10 @@ def is_numeric(self): return self.inferred_type in ['integer', 'floating'] def is_object(self): - return self.dtype == np.object_ + return is_object_dtype(self.dtype) + + def is_categorical(self): + return self.inferred_type in ['categorical'] def is_mixed(self): return 'mixed' in self.inferred_type @@ -773,14 +911,11 @@ def is_int(v): return indexer - def _convert_list_indexer(self, key, kind=None): - """ convert a list indexer. these should be locations """ - return key - - def _convert_list_indexer_for_mixed(self, keyarr, kind=None): - """ passed a key that is tuplesafe that is integer based - and we have a mixed index (e.g. number/labels). figure out - the indexer. return None if we can't help + def _convert_list_indexer(self, keyarr, kind=None): + """ + passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. 
return None if we can't help """ if (kind is None or kind in ['iloc','ix']) and (is_integer_dtype(keyarr) and not self.is_floating()): if self.inferred_type != 'integer': @@ -955,17 +1090,13 @@ def __getitem__(self, key): else: return result - def append(self, other): + def _ensure_compat_append(self, other): """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices + prepare the append Returns ------- - appended : Index + list of to_concat, name of result Index """ name = self.name to_concat = [self] @@ -976,14 +1107,30 @@ def append(self, other): to_concat.append(other) for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: + if (isinstance(obj, Index) and + obj.name != name and + obj.name is not None): name = None break to_concat = self._ensure_compat_concat(to_concat) to_concat = [x.values if isinstance(x, Index) else x for x in to_concat] + return to_concat, name + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + Returns + ------- + appended : Index + """ + to_concat, name = self._ensure_compat_append(other) return Index(np.concatenate(to_concat), name=name) @staticmethod @@ -1045,10 +1192,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): from pandas.core.format import format_array - if values.dtype == np.object_: + if is_categorical_dtype(values.dtype): + values = np.array(values) + elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) - if values.dtype == np.object_: + if is_object_dtype(values.dtype): result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n')) for x in values] @@ -1070,12 +1219,16 @@ def to_native_types(self, slicer=None, **kwargs): values = values[slicer] return values._format_native_types(**kwargs) - def _format_native_types(self, na_rep='', **kwargs): + def _format_native_types(self, na_rep='', quoting=None, **kwargs): """ actually format my specific types """ mask = isnull(self) - values = np.array(self, dtype=object, copy=True) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + values[mask] = na_rep - return values.tolist() + return values def equals(self, other): """ @@ -1087,9 +1240,6 @@ def equals(self, other): if not isinstance(other, Index): return False - if type(other) != Index: - return other.equals(self) - return array_equivalent(_values_from_object(self), _values_from_object(other)) def identical(self, other): @@ -1183,26 +1333,26 @@ def argsort(self, *args, **kwargs): return result.argsort(*args, **kwargs) def __add__(self, other): - if isinstance(other, Index): + if com.is_list_like(other): warnings.warn("using '+' to provide set union with Indexes is deprecated, " - "use '|' or .union()",FutureWarning) + "use '|' or .union()", FutureWarning) + if isinstance(other, Index): return self.union(other) return Index(np.array(self) + other) + + def __radd__(self, other): + if com.is_list_like(other): + warnings.warn("using '+' to provide set union with Indexes is deprecated, " + "use '|' or .union()", FutureWarning) + return Index(other + np.array(self)) + __iadd__ = __add__ def __sub__(self, other): - if isinstance(other, Index): - warnings.warn("using '-' to provide set differences with Indexes is deprecated, " - "use .difference()",FutureWarning) + warnings.warn("using '-' to provide set differences with Indexes is deprecated, 
" + "use .difference()",FutureWarning) return self.difference(other) - __eq__ = _indexOp('__eq__') - __ne__ = _indexOp('__ne__') - __lt__ = _indexOp('__lt__') - __gt__ = _indexOp('__gt__') - __le__ = _indexOp('__le__') - __ge__ = _indexOp('__ge__') - def __and__(self, other): return self.intersection(other) @@ -1224,18 +1374,16 @@ def union(self, other): ------- union : Index """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable.') + self._assert_can_do_setop(other) + other = _ensure_index(other) if len(other) == 0 or self.equals(other): return self if len(self) == 0: - return _ensure_index(other) + return other - self._assert_can_do_setop(other) - - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) @@ -1299,17 +1447,13 @@ def intersection(self, other): ------- intersection : Index """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') - self._assert_can_do_setop(other) - other = _ensure_index(other) if self.equals(other): return self - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other) @@ -1352,23 +1496,17 @@ def difference(self, other): >>> index.difference(index2) """ - - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') + self._assert_can_do_setop(other) if self.equals(other): return Index([], name=self.name) - if not isinstance(other, Index): - other = np.asarray(other) - result_name = self.name - else: - result_name = self.name if self.name == other.name else None + other, result_name = self._convert_can_do_setop(other) theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) - diff = deprecate('diff',difference) + diff = deprecate('diff', difference) def sym_diff(self, other, result_name=None): """ @@ -1377,7 +1515,7 @@ def sym_diff(self, other, result_name=None): Parameters ---------- - other : array-like + other : Index or array-like result_name : str Returns @@ -1405,13 +1543,10 @@ def sym_diff(self, other, result_name=None): >>> idx1 ^ idx2 Int64Index([1, 5], dtype='int64') """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') - - if not isinstance(other, Index): - other = Index(other) - result_name = result_name or self.name - + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update the_diff = sorted(set((self.difference(other)).union(other.difference(self)))) return Index(the_diff, name=result_name) @@ -1468,7 +1603,7 @@ def get_value(self, series, key): raise except TypeError: # generator/iterator-like - if com.is_iterator(key): + if is_iterator(key): raise InvalidIndexError(key) else: raise e1 @@ -1543,7 +1678,7 @@ def get_indexer(self, target, method=None, limit=None): if pself is not self or ptarget is not target: return pself.get_indexer(ptarget, method=method, limit=limit) - if self.dtype != target.dtype: + if not is_dtype_equal(self.dtype,target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer(target, method=method, limit=limit) @@ -1642,7 +1777,8 @@ def get_indexer_for(self, target, **kwargs): """ guaranteed return of an indexer even when non-unique """ if self.is_unique: return self.get_indexer(target, **kwargs) - return self.get_indexer_non_unique(target, **kwargs)[0] + 
indexer, _ = self.get_indexer_non_unique(target, **kwargs) + return indexer def _possibly_promote(self, other): # A hack, but it works @@ -1650,7 +1786,7 @@ def _possibly_promote(self, other): if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): return DatetimeIndex(self), other elif self.inferred_type == 'boolean': - if self.dtype != 'object': + if not is_object_dtype(self.dtype): return self.astype('object'), other.astype('object') return self, other @@ -1702,12 +1838,35 @@ def isin(self, values, level=None): value_set = set(values) if level is not None: self._validate_index_level(level) - return lib.ismember(self._array_values(), value_set) + return lib.ismember(np.array(self), value_set) + + def _can_reindex(self, indexer): + """ + *this is an internal non-public method* + + Check if we are allowing reindexing with this particular indexer + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None): """ Create index with target's values (move/add/delete values as necessary) + Parameters + ---------- + target : an iterable + Returns ------- new_index : pd.Index @@ -1728,6 +1887,7 @@ def reindex(self, target, method=None, level=None, limit=None): target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) else: target = _ensure_index(target) + if level is not None: if method is not None: raise TypeError('Fill method not supported if level passed') @@ -1752,9 +1912,72 @@ def reindex(self, target, method=None, level=None, limit=None): return target, indexer + def _reindex_non_unique(self, target): + """ + *this is an internal non-public method* + + Create a new index with target's values (move/add/delete values as necessary) + use with non-unique Index and a possibly non-unique target + + Parameters + ---------- + target : an iterable + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + + target = _ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None + + if len(missing): + l = np.arange(len(indexer)) + + missing = com._ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = com._ensure_int64(l[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = com._ensure_int64(l[check]) + + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + + # a unique indexer + if target.is_unique: + + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original + # indexer here + else: + + # need to retake to have the same size as the indexer + indexer = indexer.values + indexer[~check] = 0 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + + return self._shallow_copy(new_labels), indexer, new_indexer + def join(self, other, how='left', level=None, return_indexers=False): """ - Internal API method. 
Compute join_index and indexers to conform data + *this is an internal non-public method* + + Compute join_index and indexers to conform data structures to the new index. Parameters @@ -1813,7 +2036,7 @@ def join(self, other, how='left', level=None, return_indexers=False): result = x, z, y return result - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.join(other, how=how, @@ -2325,13 +2548,15 @@ def insert(self, loc, item): (_self[:loc], item_idx, _self[loc:])) return Index(idx, name=self.name) - def drop(self, labels): + def drop(self, labels, errors='raise'): """ Make new Index with passed list of labels deleted Parameters ---------- labels : array-like + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. Returns ------- @@ -2341,7 +2566,9 @@ def drop(self, labels): indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): - raise ValueError('labels %s not contained in axis' % labels[mask]) + if errors != 'ignore': + raise ValueError('labels %s not contained in axis' % labels[mask]) + indexer = indexer[~mask] return self.delete(indexer) @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs) @@ -2360,6 +2587,49 @@ def _evaluate_with_timedelta_like(self, other, op, opstr): def _evaluate_with_datetime_like(self, other, op, opstr): raise TypeError("can only perform ops with datetime like values") + @classmethod + def _add_comparison_methods(cls): + """ add in comparison methods """ + + def _make_compare(op): + + def _evaluate_compare(self, other): + func = getattr(self.values, op) + result = func(np.asarray(other)) + + # technically we could support bool dtyped Index + # for now just return the indexing array directly + if is_bool_dtype(result): + return result + try: + return Index(result) + except TypeError: + return result + + return _evaluate_compare + + cls.__eq__ = _make_compare('__eq__') + cls.__ne__ = _make_compare('__ne__') + cls.__lt__ = _make_compare('__lt__') + cls.__gt__ = _make_compare('__gt__') + cls.__le__ = _make_compare('__le__') + cls.__ge__ = _make_compare('__ge__') + + @classmethod + def _add_numericlike_set_methods_disabled(cls): + """ add in the numeric set-like methods to disable """ + + def _make_invalid_op(name): + + def invalid_op(self, other=None): + raise TypeError("cannot perform {name} with this index type: {typ}".format(name=name, + typ=type(self))) + invalid_op.__name__ = name + return invalid_op + + cls.__add__ = cls.__radd__ = __iadd__ = _make_invalid_op('__add__') + cls.__sub__ = __isub__ = _make_invalid_op('__sub__') + @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable """ @@ -2414,7 +2684,7 @@ def _evaluate_numeric_binop(self, other): elif isinstance(other, (Timestamp, np.datetime64)): return self._evaluate_with_datetime_like(other, op, opstr) else: - if not (com.is_float(other) or com.is_integer(other)): + if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with scalar values") # if we are a reversed non-communative op @@ -2478,7 +2748,7 @@ def _make_logical_function(name, desc, f): @Appender(_doc) def logical_func(self, *args, **kwargs): result = f(self.values) - if isinstance(result, (np.ndarray, com.ABCSeries, Index)) \ + if isinstance(result, (np.ndarray, ABCSeries, Index)) \ and result.ndim == 0: # return NumPy type return result.dtype.type(result.item()) @@ -2510,6 +2780,529 @@ def invalid_op(self, 
other=None): Index._add_numeric_methods_disabled() Index._add_logical_methods() +Index._add_comparison_methods() + +class CategoricalIndex(Index, PandasDelegate): + """ + + Immutable Index implementing an ordered, sliceable set. CategoricalIndex + represents a sparsely populated Index with an underlying Categorical. + + Parameters + ---------- + data : array-like or Categorical, (1-dimensional) + categories : optional, array-like + categories for the CategoricalIndex + ordered : boolean, + designating if the categories are ordered + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + + """ + + _typ = 'categoricalindex' + _engine_type = _index.Int64Engine + _attributes = ['name','categories','ordered'] + + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): + + if fastpath: + return cls._simple_new(data, name=name) + + if isinstance(data, ABCCategorical): + data = cls._create_categorical(cls, data, categories, ordered) + elif isinstance(data, CategoricalIndex): + data = data._data + data = cls._create_categorical(cls, data, categories, ordered) + else: + + # don't allow scalars + # if data is None, then categories must be provided + if lib.isscalar(data): + if data is not None or categories is None: + cls._scalar_data_error(data) + data = [] + data = cls._create_categorical(cls, data, categories, ordered) + + if copy: + data = data.copy() + + return cls._simple_new(data, name=name) + + def _create_from_codes(self, codes, categories=None, ordered=None, name=None): + """ + *this is an internal non-public method* + + create the correct categorical from codes + + Parameters + ---------- + codes : new codes + categories : optional categories, defaults to existing + ordered : optional ordered attribute, defaults to existing + name : optional name attribute, defaults to existing + + Returns + ------- + CategoricalIndex + """ + + from pandas.core.categorical import Categorical + if categories is None: + categories = self.categories + if ordered is None: + ordered = self.ordered + if name is None: + name = self.name + cat = Categorical.from_codes(codes, categories=categories, ordered=self.ordered) + return CategoricalIndex(cat, name=name) + + @staticmethod + def _create_categorical(self, data, categories=None, ordered=None): + """ + *this is an internal non-public method* + + create the correct categorical from data and the properties + + Parameters + ---------- + data : data for new Categorical + categories : optional categories, defaults to existing + ordered : optional ordered attribute, defaults to existing + + Returns + ------- + Categorical + """ + + if not isinstance(data, ABCCategorical): + from pandas.core.categorical import Categorical + data = Categorical(data, categories=categories, ordered=ordered) + else: + if categories is not None: + data = data.set_categories(categories) + if ordered is not None: + data = data.set_ordered(ordered) + return data + + @classmethod + def _simple_new(cls, values, name=None, categories=None, ordered=None, **kwargs): + result = object.__new__(cls) + + values = cls._create_categorical(cls, values, categories, ordered) + result._data = values + result.name = name + for k, v in compat.iteritems(kwargs): + setattr(result,k,v) + + result._reset_identity() + return result + + def _is_dtype_compat(self, other): + """ + *this is an internal non-public method* + + provide a comparison between the dtype of self and other (coercing if needed) + + Raises + 
------ + TypeError if the dtypes are not compatible + """ + + if is_categorical_dtype(other): + if isinstance(other, CategoricalIndex): + other = other.values + if not other.is_dtype_equal(self): + raise TypeError("categories must match existing categories when appending") + else: + values = other + other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered)) + if not other.isin(values).all(): + raise TypeError("cannot append a non-category item to a CategoricalIndex") + + return other + + def equals(self, other): + """ + Determines if two CategorialIndex objects contain the same elements. + """ + if self.is_(other): + return True + + try: + other = self._is_dtype_compat(other) + return array_equivalent(self._data, other) + except (TypeError, ValueError): + pass + + return False + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value) + """ + max_categories = (10 if get_option("display.max_categories") == 0 + else get_option("display.max_categories")) + attrs = [('categories', default_pprint(self.categories, max_seq_items=max_categories)), + ('ordered',self.ordered)] + if self.name is not None: + attrs.append(('name',default_pprint(self.name))) + attrs.append(('dtype',"'%s'" % self.dtype)) + max_seq_items = get_option('display.max_seq_items') + if len(self) > max_seq_items: + attrs.append(('length',len(self))) + return attrs + + @property + def inferred_type(self): + return 'categorical' + + @property + def values(self): + """ return the underlying data, which is a Categorical """ + return self._data + + @property + def codes(self): + return self._data.codes + + @property + def categories(self): + return self._data.categories + + @property + def ordered(self): + return self._data.ordered + + def __contains__(self, key): + hash(key) + return key in self.values + + def __array__(self, dtype=None): + """ the array interface, return my values """ + return np.array(self._data, dtype=dtype) + + def argsort(self, *args, **kwargs): + return self.values.argsort(*args, **kwargs) + + @cache_readonly + def _engine(self): + + # we are going to look things up with the codes themselves + return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + + @cache_readonly + def is_unique(self): + return not self.duplicated().any() + + @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) + def duplicated(self, take_last=False): + from pandas.hashtable import duplicated_int64 + return duplicated_int64(self.codes.astype('i8'), take_last) + + def get_loc(self, key, method=None): + """ + Get integer location for requested label + + Parameters + ---------- + key : label + method : {None} + * default: exact matches only. 
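# Minimal sketch of the CategoricalIndex accessors defined above, assuming the
# class is exported at the top level as pd.CategoricalIndex (the export itself
# is not shown in this hunk):
import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'a'], categories=['a', 'b', 'c'], ordered=False)
print(ci.categories)   # the category labels, as an Index
print(ci.codes)        # integer codes into the categories, e.g. [0, 1, 0]
print(ci.ordered)      # False
print(ci.is_unique)    # False -- duplicated() sees the repeated 'a'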
+ + Returns + ------- + loc : int if unique index, possibly slice or mask if not + """ + codes = self.categories.get_loc(key) + if (codes == -1): + raise KeyError(key) + indexer, _ = self._engine.get_indexer_non_unique(np.array([codes])) + if (indexer==-1).any(): + raise KeyError(key) + + return indexer + + def _can_reindex(self, indexer): + """ always allow reindexing """ + pass + + def reindex(self, target, method=None, level=None, limit=None): + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + + if method is not None: + raise NotImplementedError("argument method is not implemented for CategoricalIndex.reindex") + if level is not None: + raise NotImplementedError("argument level is not implemented for CategoricalIndex.reindex") + if limit is not None: + raise NotImplementedError("argument limit is not implemented for CategoricalIndex.reindex") + + target = _ensure_index(target) + + if not is_categorical_dtype(target) and not target.is_unique: + raise ValueError("cannot reindex with a non-unique indexer") + + indexer, missing = self.get_indexer_non_unique(np.array(target)) + new_target = self.take(indexer) + + + # filling in missing if needed + if len(missing): + cats = self.categories.get_indexer(target) + if (cats==-1).any(): + + # coerce to a regular index here! + result = Index(np.array(self),name=self.name) + new_target, indexer, _ = result._reindex_non_unique(np.array(target)) + + else: + + codes = new_target.codes.copy() + codes[indexer==-1] = cats[missing] + new_target = self._create_from_codes(codes) + + # we always want to return an Index type here + # to be consistent with .reindex for other index types (e.g. they don't coerce + # based on the actual values, only on the dtype) + # unless we had an inital Categorical to begin with + # in which case we are going to conform to the passed Categorical + new_target = np.asarray(new_target) + if is_categorical_dtype(target): + new_target = target._shallow_copy(new_target, name=self.name) + else: + new_target = Index(new_target, name=self.name) + + return new_target, indexer + + def _reindex_non_unique(self, target): + """ reindex from a non-unique; which CategoricalIndex's are almost always """ + new_target, indexer = self.reindex(target) + new_indexer = None + + check = indexer==-1 + if check.any(): + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[check] = -1 + + return new_target, indexer, new_indexer + + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. 
The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : MultiIndex or Index (of tuples) + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method and probably should be used at your own risk + + Examples + -------- + >>> indexer, mask = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + >>> new_values[-mask] = np.nan + + Returns + ------- + (indexer, mask) : (ndarray, ndarray) + """ + method = com._clean_reindex_fill_method(method) + target = _ensure_index(target) + + if isinstance(target, CategoricalIndex): + target = target.categories + + if method == 'pad' or method == 'backfill': + raise NotImplementedError("method='pad' and method='backfill' not implemented yet " + 'for CategoricalIndex') + elif method == 'nearest': + raise NotImplementedError("method='nearest' not implemented yet " + 'for CategoricalIndex') + else: + + codes = self.categories.get_indexer(target) + indexer, _ = self._engine.get_indexer_non_unique(codes) + + return com._ensure_platform_int(indexer) + + def get_indexer_non_unique(self, target): + """ this is the same for a CategoricalIndex for get_indexer; the API returns the missing values as well """ + target = _ensure_index(target) + + if isinstance(target, CategoricalIndex): + target = target.categories + + codes = self.categories.get_indexer(target) + return self._engine.get_indexer_non_unique(codes) + + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list indexer. + Return our indexer or raise if all of the values are not included in the categories + """ + codes = self.categories.get_indexer(keyarr) + if (codes==-1).any(): + raise KeyError("a list-indexer must only include values that are in the categories") + + return None + + def take(self, indexer, axis=0): + """ + return a new CategoricalIndex of the values selected by the indexer + + See also + -------- + numpy.ndarray.take + """ + + indexer = com._ensure_platform_int(indexer) + taken = self.codes.take(indexer) + return self._create_from_codes(taken) + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted + + Returns + ------- + new_index : Index + """ + return self._create_from_codes(np.delete(self.codes, loc)) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location. 
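# Hedged example of the list-indexer rule above (assuming the indexing hook
# added later in this patch routes list selection through
# _convert_list_indexer): every label in the list must be one of the
# categories, otherwise a KeyError is raised.
import pandas as pd

s = pd.Series([1, 2, 3], index=pd.CategoricalIndex(['a', 'b', 'c']))
print(s.loc[['a', 'c']])   # fine: both labels are categories
# s.loc[['a', 'z']]        # KeyError: a list-indexer must only include values that are in the categories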
Follows + Python list.append semantics for negative values + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + ValueError if the item is not in the categories + + """ + code = self.categories.get_indexer([item]) + if (code == -1): + raise TypeError("cannot insert an item into a CategoricalIndex that is not already an existing category") + + codes = self.codes + codes = np.concatenate( + (codes[:loc], code, codes[loc:])) + return self._create_from_codes(codes) + + def append(self, other): + """ + Append a collection of CategoricalIndex options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + + Raises + ------ + ValueError if other is not in the categories + """ + to_concat, name = self._ensure_compat_append(other) + to_concat = [ self._is_dtype_compat(c) for c in to_concat ] + codes = np.concatenate([ c.codes for c in to_concat ]) + return self._create_from_codes(codes, name=name) + + @classmethod + def _add_comparison_methods(cls): + """ add in comparison methods """ + + def _make_compare(op): + + def _evaluate_compare(self, other): + + # if we have a Categorical type, then must have the same categories + if isinstance(other, CategoricalIndex): + other = other.values + elif isinstance(other, Index): + other = self._create_categorical(self, other.values, categories=self.categories, ordered=self.ordered) + + if isinstance(other, ABCCategorical): + if not (self.values.is_dtype_equal(other) and len(self.values) == len(other)): + raise TypeError("categorical index comparisions must have the same categories and ordered attributes") + + return getattr(self.values, op)(other) + + return _evaluate_compare + + cls.__eq__ = _make_compare('__eq__') + cls.__ne__ = _make_compare('__ne__') + cls.__lt__ = _make_compare('__lt__') + cls.__gt__ = _make_compare('__gt__') + cls.__le__ = _make_compare('__le__') + cls.__ge__ = _make_compare('__ge__') + + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the .values """ + method = getattr(self.values, name) + if 'inplace' in kwargs: + raise ValueError("cannot use inplace with CategoricalIndex") + res = method(*args, **kwargs) + if lib.isscalar(res): + return res + return CategoricalIndex(res, name=self.name) + + @classmethod + def _add_accessors(cls): + """ add in Categorical accessor methods """ + + from pandas.core.categorical import Categorical + CategoricalIndex._add_delegate_accessors(delegate=Categorical, + accessors=["rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + "min", + "max"], + typ='method', + overwrite=True) + + +CategoricalIndex._add_numericlike_set_methods_disabled() +CategoricalIndex._add_numeric_methods_disabled() +CategoricalIndex._add_logical_methods_disabled() +CategoricalIndex._add_comparison_methods() +CategoricalIndex._add_accessors() class NumericIndex(Index): @@ -2782,7 +3575,7 @@ def equals(self, other): try: if not isinstance(other, Float64Index): other = self._constructor(other) - if self.dtype != other.dtype or self.shape != other.shape: + if not is_dtype_equal(self.dtype,other.dtype) or self.shape != other.shape: return False left, right = self.values, other.values return ((left == right) | (self._isnan & other._isnan)).all() @@ -2848,7 +3641,7 @@ def isin(self, values, level=None): value_set = set(values) if level is not None: 
self._validate_index_level(level) - return lib.ismember_nans(self._array_values(), value_set, + return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) @@ -3188,7 +3981,7 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, verify_integrity=False, _set_identity=_set_identity) - def __array__(self, result=None): + def __array__(self, dtype=None): """ the array interface, return my values """ return self.values @@ -3200,10 +3993,6 @@ def view(self, cls=None): _shallow_copy = view - def _array_values(self): - # hack for various methods - return self.values - @cache_readonly def dtype(self): return np.dtype('O') @@ -3216,40 +4005,24 @@ def nbytes(self): names_nbytes = sum(( getsizeof(i) for i in self.names )) return level_nbytes + label_nbytes + names_nbytes - def __repr__(self): - encoding = get_option('display.encoding') - attrs = [('levels', default_pprint(self.levels)), - ('labels', default_pprint(self.labels))] + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value) + """ + attrs = [('levels', default_pprint(self._levels, max_seq_items=False)), + ('labels', default_pprint(self._labels, max_seq_items=False))] if not all(name is None for name in self.names): attrs.append(('names', default_pprint(self.names))) if self.sortorder is not None: attrs.append(('sortorder', default_pprint(self.sortorder))) + return attrs - space = ' ' * (len(self.__class__.__name__) + 1) - prepr = (u(",\n%s") % space).join([u("%s=%s") % (k, v) - for k, v in attrs]) - res = u("%s(%s)") % (self.__class__.__name__, prepr) - - if not compat.PY3: - # needs to be str in Python 2 - res = res.encode(encoding) - return res - - def __unicode__(self): - """ - Return a string representation for a particular Index + def _format_space(self): + return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - Invoked by unicode(df) in py2 only. Yields a Unicode String in both - py2/py3. - """ - rows = self.format(names=True) - max_rows = get_option('display.max_rows') - if len(rows) > max_rows: - spaces = (len(rows[0]) - 3) // 2 - centered = ' ' * spaces - half = max_rows // 2 - rows = rows[:half] + [centered + '...' 
+ centered] + rows[-half:] - return "\n".join(rows) + def _format_data(self): + # we are formatting thru the attributes + return None def __len__(self): return len(self.labels[0]) @@ -3293,7 +4066,7 @@ def _reference_duplicate_name(self, name): return np.sum(name == np.asarray(self.names)) > 1 def _format_native_types(self, **kwargs): - return self.tolist() + return self.values @property def _constructor(self): @@ -3350,7 +4123,7 @@ def values(self): taken = com.take_1d(lev._box_values(lev.values), lab, fill_value=_get_na_value(lev.dtype.type)) else: - taken = com.take_1d(lev.values, lab) + taken = com.take_1d(np.asarray(lev.values), lab) values.append(taken) self._tuples = lib.fast_zip(values) @@ -3415,7 +4188,7 @@ def _try_mi(k): raise except TypeError: # generator/iterator-like - if com.is_iterator(key): + if is_iterator(key): raise InvalidIndexError(key) else: raise e1 @@ -3847,7 +4620,7 @@ def repeat(self, n): sortorder=self.sortorder, verify_integrity=False) - def drop(self, labels, level=None): + def drop(self, labels, level=None, errors='raise'): """ Make new MultiIndex with passed list of labels deleted @@ -3870,19 +4643,24 @@ def drop(self, labels, level=None): indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): - raise ValueError('labels %s not contained in axis' - % labels[mask]) - return self.delete(indexer) + if errors != 'ignore': + raise ValueError('labels %s not contained in axis' + % labels[mask]) + indexer = indexer[~mask] except Exception: pass inds = [] for label in labels: - loc = self.get_loc(label) - if isinstance(loc, int): - inds.append(loc) - else: - inds.extend(lrange(loc.start, loc.stop)) + try: + loc = self.get_loc(label) + if isinstance(loc, int): + inds.append(loc) + else: + inds.extend(lrange(loc.start, loc.stop)) + except KeyError: + if errors != 'ignore': + raise return self.delete(inds) @@ -4014,7 +4792,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): labels = list(self.labels) shape = list(self.levshape) - if isinstance(level, (str, int)): + if isinstance(level, (compat.string_types, int)): level = [level] level = [self._get_level_number(lev) for lev in level] @@ -4081,7 +4859,7 @@ def get_indexer(self, target, method=None, limit=None): if isinstance(target, MultiIndex): target_index = target._tuple_index - if target_index.dtype != object: + if not is_object_dtype(target_index.dtype): return np.ones(len(target_index)) * -1 if not self.is_unique: @@ -4640,9 +5418,9 @@ def equals(self, other): return False for i in range(self.nlevels): - svalues = com.take_nd(self.levels[i].values, self.labels[i], + svalues = com.take_nd(np.asarray(self.levels[i].values), self.labels[i], allow_fill=False) - ovalues = com.take_nd(other.levels[i].values, other.labels[i], + ovalues = com.take_nd(np.asarray(other.levels[i].values), other.labels[i], allow_fill=False) if not array_equivalent(svalues, ovalues): return False @@ -4677,12 +5455,11 @@ def union(self, other): >>> index.union(index2) """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): return self - result_names = self.names if self.names == other.names else None - uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -4700,12 +5477,11 @@ def intersection(self, other): Index """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) if self.equals(other): 
return self - result_names = self.names if self.names == other.names else None - self_tuples = self.values other_tuples = other.values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) @@ -4726,18 +5502,10 @@ def difference(self, other): diff : MultiIndex """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) - if not isinstance(other, MultiIndex): - if len(other) == 0: + if len(other) == 0: return self - try: - other = MultiIndex.from_tuples(other) - except: - raise TypeError('other must be a MultiIndex or a list of' - ' tuples') - result_names = self.names - else: - result_names = self.names if self.names == other.names else None if self.equals(other): return MultiIndex(levels=[[]] * self.nlevels, @@ -4754,15 +5522,30 @@ def difference(self, other): return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - def _assert_can_do_setop(self, other): - pass - def astype(self, dtype): - if np.dtype(dtype) != np.object_: + if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) return self._shallow_copy() + def _convert_can_do_setop(self, other): + result_names = self.names + + if not hasattr(other, 'names'): + if len(other) == 0: + other = MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + verify_integrity=False) + else: + msg = 'other must be a MultiIndex or a list of tuples' + try: + other = MultiIndex.from_tuples(other) + except: + raise TypeError(msg) + else: + result_names = self.names if self.names == other.names else None + return other, result_names + def insert(self, loc, item): """ Make new MultiIndex inserting new item at location @@ -4838,7 +5621,7 @@ def _wrap_joined_index(self, joined, other): @Appender(Index.isin.__doc__) def isin(self, values, level=None): if level is None: - return lib.ismember(self._array_values(), set(values)) + return lib.ismember(np.array(self), set(values)) else: num = self._get_level_number(level) levs = self.levels[num] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 920e8aa04aa1f..e0f06e22c431b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,7 +1,6 @@ # pylint: disable=W0223 -from datetime import datetime -from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.index import Index, MultiIndex from pandas.compat import range, zip import pandas.compat as compat import pandas.core.common as com @@ -10,8 +9,6 @@ is_null_slice, ABCSeries, ABCDataFrame, ABCPanel, is_float, _values_from_object, _infer_fill_value, is_integer) -import pandas.lib as lib - import numpy as np # the supported indexers @@ -200,7 +197,6 @@ def _has_valid_positional_setitem_indexer(self, indexer): return True def _setitem_with_indexer(self, indexer, value): - self._has_valid_setitem_indexer(indexer) # also has the side effect of consolidating in-place @@ -254,7 +250,7 @@ def _setitem_with_indexer(self, indexer, value): # just replacing the block manager here # so the object is the same index = self.obj._get_axis(i) - labels = safe_append_to_index(index, key) + labels = index.insert(len(index),key) self.obj._data = self.obj.reindex_axis(labels, i)._data self.obj._maybe_update_cacher(clear=True) self.obj.is_copy=None @@ -275,10 +271,7 @@ def _setitem_with_indexer(self, indexer, value): # and set inplace if self.ndim == 1: index = self.obj.index - if len(index) == 0: - new_index = Index([indexer]) - else: - new_index = safe_append_to_index(index, 
indexer) + new_index = index.insert(len(index),indexer) # this preserves dtype of the value new_values = Series([value]).values @@ -486,8 +479,8 @@ def can_do_equal_len(): self.obj[item_labels[indexer[info_axis]]] = value return - if isinstance(value, ABCSeries): - value = self._align_series(indexer, value) + if isinstance(value, (ABCSeries, dict)): + value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): value = self._align_frame(indexer, value) @@ -929,24 +922,6 @@ def _getitem_iterable(self, key, axis=0): labels = self.obj._get_axis(axis) - def _reindex(keys, level=None): - - try: - result = self.obj.reindex_axis(keys, axis=axis, level=level) - except AttributeError: - # Series - if axis != 0: - raise AssertionError('axis must be 0') - return self.obj.reindex(keys, level=level) - - # this is an error as we are trying to find - # keys in a multi-index that don't exist - if isinstance(labels, MultiIndex) and level is not None: - if hasattr(result,'ndim') and not np.prod(result.shape) and len(keys): - raise KeyError("cannot index a multi-index axis with these keys") - - return result - if is_bool_indexer(key): key = check_bool_indexer(labels, key) inds, = key.nonzero() @@ -959,8 +934,9 @@ def _reindex(keys, level=None): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - # handle a mixed integer scenario - indexer = labels._convert_list_indexer_for_mixed(keyarr, kind=self.name) + # have the index handle the indexer and possibly return + # an indexer or raising + indexer = labels._convert_list_indexer(keyarr, kind=self.name) if indexer is not None: return self.obj.take(indexer, axis=axis) @@ -971,65 +947,48 @@ def _reindex(keys, level=None): else: level = None - keyarr_is_unique = Index(keyarr).is_unique + # existing labels are unique and indexer are unique + if labels.is_unique and Index(keyarr).is_unique: + + try: + result = self.obj.reindex_axis(keyarr, axis=axis, level=level) - # existing labels are unique and indexer is unique - if labels.is_unique and keyarr_is_unique: - return _reindex(keyarr, level=level) + # this is an error as we are trying to find + # keys in a multi-index that don't exist + if isinstance(labels, MultiIndex) and level is not None: + if hasattr(result,'ndim') and not np.prod(result.shape) and len(keyarr): + raise KeyError("cannot index a multi-index axis with these keys") + return result + + except AttributeError: + + # Series + if axis != 0: + raise AssertionError('axis must be 0') + return self.obj.reindex(keyarr, level=level) + + # existing labels are non-unique else: - indexer, missing = labels.get_indexer_non_unique(keyarr) - check = indexer != -1 - result = self.obj.take(indexer[check], axis=axis, - convert=False) - - # need to merge the result labels and the missing labels - if len(missing): - l = np.arange(len(indexer)) - - missing = com._ensure_platform_int(missing) - missing_labels = keyarr.take(missing) - missing_indexer = com._ensure_int64(l[~check]) - cur_labels = result._get_axis(axis).values - cur_indexer = com._ensure_int64(l[check]) - - new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels - - # reindex with the specified axis - ndim = self.obj.ndim - if axis + 1 > ndim: - raise AssertionError("invalid indexing error with " - "non-unique index") - - # a unique indexer - if keyarr_is_unique: - - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - 
new_indexer[cur_indexer] = np.arange( - len(result._get_axis(axis)) - ) - new_indexer[missing_indexer] = -1 - # we have a non_unique selector, need to use the original - # indexer here - else: + # reindex with the specified axis + if axis + 1 > self.obj.ndim: + raise AssertionError("invalid indexing error with " + "non-unique index") - # need to retake to have the same size as the indexer - rindexer = indexer.values - rindexer[~check] = 0 - result = self.obj.take(rindexer, axis=axis, - convert=False) + new_target, indexer, new_indexer = labels._reindex_non_unique(keyarr) - # reset the new indexer to account for the new size - new_indexer = np.arange(len(result)) - new_indexer[~check] = -1 + if new_indexer is not None: + result = self.obj.take(indexer[indexer!=-1], axis=axis, + convert=False) result = result._reindex_with_indexers({ - axis: [new_labels, new_indexer] - }, copy=True, allow_dups=True) + axis: [new_target, new_indexer] + }, copy=True, allow_dups=True) + + else: + result = self.obj.take(indexer, axis=axis, + convert=False) return result @@ -1106,8 +1065,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): else: objarr = _asarray_tuplesafe(obj) - # If have integer labels, defer to label-based indexing - indexer = labels._convert_list_indexer_for_mixed(objarr, kind=self.name) + # The index may want to handle a list indexer differently + # by returning an indexer or raising + indexer = labels._convert_list_indexer(objarr, kind=self.name) if indexer is not None: return indexer @@ -1628,8 +1588,8 @@ def length_of_indexer(indexer, target=None): if step is None: step = 1 elif step < 0: - step = abs(step) - return (stop - start) / step + step = -step + return (stop - start + step-1) // step elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)): return len(indexer) elif not is_list_like_indexer(indexer): @@ -1720,19 +1680,6 @@ def get_indexer(_i, _idx): return tuple([get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)]) -def safe_append_to_index(index, key): - """ a safe append to an index, if incorrect type, then catch and recreate - """ - try: - return index.insert(len(index), key) - except: - - # raise here as this is basically an unsafe operation and we want - # it to be obvious that you are doing something wrong - raise ValueError("unsafe appending to index of type {0} with a key " - "{1}".format(index.__class__.__name__, key)) - - def maybe_convert_indices(indices, n): """ if we have negative indicies, translate to postive here if have indicies that are out-of-bounds, raise an IndexError diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7a16fb2b6b0d7..3395ea360165e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -294,8 +294,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): mask = isnull(self.values) if limit is not None: if self.ndim > 2: - raise NotImplementedError - mask[mask.cumsum(self.ndim-1)>limit]=False + raise NotImplementedError("number of dimensions for 'fillna' " + "is currently limited to 2") + mask[mask.cumsum(self.ndim-1) > limit] = False value = self._try_fill(value) blocks = self.putmask(mask, value, inplace=inplace) @@ -483,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None): def _try_fill(self, value): return value - def to_native_types(self, slicer=None, na_rep='', **kwargs): + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not 
None: values = values[:, slicer] - values = np.array(values, dtype=object) mask = isnull(values) + + if not self.is_object and not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + values[mask] = na_rep - return values.tolist() + return values # block actions #### def copy(self, deep=True): @@ -581,7 +587,7 @@ def _is_empty_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return all([ isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer ]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) return False # empty indexers @@ -868,9 +874,9 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): def get_values(self, dtype=None): return self.values - def diff(self, n): + def diff(self, n, axis=1): """ return block for the diff of the values """ - new_values = com.diff(self.values, n, axis=1) + new_values = com.diff(self.values, n, axis=axis) return [make_block(values=new_values, ndim=self.ndim, fastpath=True, placement=self.mgr_locs)] @@ -1220,32 +1226,34 @@ def _try_cast(self, element): return element def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.', - **kwargs): + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - values = np.array(values, dtype=object) mask = isnull(values) - values[mask] = na_rep - + formatter = None if float_format and decimal != '.': formatter = lambda v : (float_format % v).replace('.',decimal,1) elif decimal != '.': formatter = lambda v : ('%g' % v).replace('.',decimal,1) elif float_format: formatter = lambda v : float_format % v + + if formatter is None and not quoting: + values = values.astype(str) else: - formatter = None + values = np.array(values, dtype='object') + values[mask] = na_rep if formatter: imask = (~mask).ravel() values.flat[imask] = np.array( [formatter(val) for val in values.ravel()[imask]]) - return values.tolist() + return values def should_store(self, value): # when inserting a column should not coerce integers to floats @@ -1324,13 +1332,11 @@ def _try_fill(self, value): return value def _try_coerce_args(self, values, other): - """ provide coercion to our input arguments - we are going to compare vs i8, so coerce to floats - repring NaT with np.nan so nans propagate - values is always ndarray like, other may not be """ + """ Coerce values and other to float64, with null values converted to + NaN. 
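# Hedged usage sketch for the float formatting path above: to_native_types is
# what DataFrame.to_csv ultimately calls per block, assuming to_csv forwards
# float_format/decimal down to this formatter as the signature suggests.
import pandas as pd

df = pd.DataFrame({'x': [1.5, 2.25, None]})
print(df.to_csv(sep=';', float_format='%.1f', decimal=','))   # '.' replaced by ',' once per value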
values is always ndarray-like, other may not be """ def masker(v): mask = isnull(v) - v = v.view('i8').astype('float64') + v = v.astype('float64') v[mask] = np.nan return v @@ -1342,6 +1348,8 @@ def masker(v): other = _coerce_scalar_to_timedelta_type(other, unit='s', box=False).item() if other == tslib.iNaT: other = np.nan + elif lib.isscalar(other): + other = np.float64(other) else: other = masker(other) @@ -1365,7 +1373,7 @@ def _try_coerce_result(self, result): def should_store(self, value): return issubclass(value.dtype.type, np.timedelta64) - def to_native_types(self, slicer=None, na_rep=None, **kwargs): + def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -1386,7 +1394,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') for val in values.ravel()[imask]], dtype=object) - return rvalues.tolist() + return rvalues def get_values(self, dtype=None): @@ -1681,10 +1689,11 @@ def _slice(self, slicer): def fillna(self, value, limit=None, inplace=False, downcast=None): # we may need to upcast our fill to match our dtype if limit is not None: - raise NotImplementedError + raise NotImplementedError("specifying a limit for 'fillna' has " + "not been implemented yet") values = self.values if inplace else self.values.copy() - return [self.make_block_same_class(values=values.fillna(fill_value=value, + return [self.make_block_same_class(values=values.fillna(value=value, limit=limit), placement=self.mgr_locs)] @@ -1761,18 +1770,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, ndim=self.ndim, placement=self.mgr_locs) - def to_native_types(self, slicer=None, na_rep='', **kwargs): + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: # Categorical is always one dimension values = values[slicer] - values = np.array(values, dtype=object) mask = isnull(values) + values = np.array(values, dtype='object') values[mask] = na_rep - # Blocks.to_native_type returns list of lists, but we are always only a list - return [values.tolist()] + + # we are expected to return a 2-d ndarray + return values.reshape(1,len(values)) class DatetimeBlock(Block): __slots__ = () @@ -1807,16 +1817,20 @@ def _try_operate(self, values): return values.view('i8') def _try_coerce_args(self, values, other): - """ provide coercion to our input arguments - we are going to compare vs i8, so coerce to integer - values is always ndarra like, other may not be """ + """ Coerce values and other to dtype 'i8'. NaN and NaT convert to + the smallest i8, and will correctly round-trip to NaT if converted + back in _try_coerce_result. 
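# Hedged example for the categorical fillna fix above (fillna is now called
# with value= rather than fill_value=); the fill value must be an existing
# category:
import pandas as pd

s = pd.Series(pd.Categorical(['a', None, 'b'], categories=['a', 'b']))
print(s.fillna('a'))   # NaN becomes 'a'; filling with a non-category would raise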
values is always ndarray-like, other + may not be """ values = values.view('i8') + if is_null_datelike_scalar(other): other = tslib.iNaT elif isinstance(other, datetime): other = lib.Timestamp(other).asm8.view('i8') - else: + elif hasattr(other, 'dtype') and com.is_integer_dtype(other): other = other.view('i8') + else: + other = np.array(other, dtype='i8') return values, other @@ -1848,7 +1862,8 @@ def fillna(self, value, limit=None, value = self._try_fill(value) if limit is not None: if self.ndim > 2: - raise NotImplementedError + raise NotImplementedError("number of dimensions for 'fillna' " + "is currently limited to 2") mask[mask.cumsum(self.ndim-1)>limit]=False np.putmask(values, mask, value) @@ -1857,29 +1872,21 @@ def fillna(self, value, limit=None, fastpath=True, placement=self.mgr_locs)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, - **kwargs): + quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values if slicer is not None: values = values[:, slicer] - mask = isnull(values) - - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = 'NaT' - rvalues[mask] = na_rep - imask = (~mask).ravel() - if date_format is None: - date_formatter = lambda x: Timestamp(x)._repr_base - else: - date_formatter = lambda x: Timestamp(x).strftime(date_format) - - rvalues.flat[imask] = np.array([date_formatter(val) for val in - values.ravel()[imask]], dtype=object) + from pandas.core.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(values, date_format) - return rvalues.tolist() + result = tslib.format_array_from_datetime(values.view('i8').ravel(), + tz=None, + format=format, + na_rep=na_rep).reshape(values.shape) + return result def should_store(self, value): return issubclass(value.dtype.type, np.datetime64) @@ -2011,7 +2018,8 @@ def interpolate(self, method='pad', axis=0, inplace=False, def fillna(self, value, limit=None, inplace=False, downcast=None): # we may need to upcast our fill to match our dtype if limit is not None: - raise NotImplementedError + raise NotImplementedError("specifying a limit for 'fillna' has " + "not been implemented yet") if issubclass(self.dtype.type, np.floating): value = float(value) values = self.values if inplace else self.values.copy() @@ -3126,7 +3134,6 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, pandas-indexer with -1's only. 
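# Hedged sketch of the vectorised datetime formatting above, as exercised by
# DataFrame.to_csv's date_format argument:
import pandas as pd

df = pd.DataFrame({'t': pd.to_datetime(['2015-01-01', '2015-06-30'])})
print(df.to_csv(date_format='%Y%m%d'))   # dates rendered as 20150101, 20150630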
""" - if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -3138,10 +3145,9 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, self._consolidate_inplace() - # trying to reindex on an axis with duplicates - if (not allow_dups and not self.axes[axis].is_unique - and len(indexer)): - raise ValueError("cannot reindex from a duplicate axis") + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._can_reindex(indexer) if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -3310,8 +3316,20 @@ def equals(self, other): return False self._consolidate_inplace() other._consolidate_inplace() + if len(self.blocks) != len(other.blocks): + return False + + # canonicalize block order, using a tuple combining the type + # name and then mgr_locs because there might be unconsolidated + # blocks (say, Categorical) which can only be distinguished by + # the iteration order + def canonicalize(block): + return (block.dtype.name, block.mgr_locs.as_array.tolist()) + + self_blocks = sorted(self.blocks, key=canonicalize) + other_blocks = sorted(other.blocks, key=canonicalize) return all(block.equals(oblock) for block, oblock in - zip(self.blocks, other.blocks)) + zip(self_blocks, other_blocks)) class SingleBlockManager(BlockManager): @@ -3999,7 +4017,8 @@ def _putmask_smart(v, m, n): try: nn = n[m] nn_at = nn.astype(v.dtype) - if (nn == nn_at).all(): + comp = (nn == nn_at) + if is_list_like(comp) and comp.all(): nv = v.copy() nv[m] = nn_at return nv diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f68f4f9037d97..0df160618b7c3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,7 +1,5 @@ -import sys import itertools import functools - import numpy as np try: @@ -10,7 +8,6 @@ except ImportError: # pragma: no cover _USE_BOTTLENECK = False -import pandas.core.common as com import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib from pandas.compat import builtins @@ -23,7 +20,7 @@ is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, + is_datetime_or_timedelta_dtype, _get_dtype, is_int_or_datetime_dtype, is_any_int_dtype) @@ -257,8 +254,16 @@ def nansum(values, axis=None, skipna=True): @bottleneck_switch() def nanmean(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max)) - count = _get_counts(mask, axis) + + dtype_sum = dtype_max + dtype_count = np.float64 + if is_integer_dtype(dtype): + dtype_sum = np.float64 + elif is_float_dtype(dtype): + dtype_sum = dtype + dtype_count = dtype + count = _get_counts(mask, axis, dtype=dtype_count) + the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, 'ndim', False): the_mean = the_sum / count @@ -285,6 +290,7 @@ def get_median(x): if values.dtype != np.float64: values = values.astype('f8') + values[mask] = np.nan if axis is None: values = values.ravel() @@ -559,15 +565,16 @@ def _maybe_arg_null_out(result, axis, mask, skipna): return result -def _get_counts(mask, axis): +def _get_counts(mask, axis, dtype=float): + dtype = _get_dtype(dtype) if axis is None: - return float(mask.size - mask.sum()) + return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) try: - return count.astype(float) + return count.astype(dtype) except AttributeError: - return np.array(count, 
dtype=float) + return np.array(count, dtype=dtype) def _maybe_null_out(result, axis, mask): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 954d2c8a77326..0b62eb1e53ddb 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -213,7 +213,7 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, Parameters ---------- - flex_arith_method : function (optional) + flex_arith_method : function factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) radd_func : function (optional) @@ -571,7 +571,11 @@ def na_op(x, y): return result - def wrapper(self, other): + def wrapper(self, other, axis=None): + # Validate the axis parameter + if axis is not None: + self._get_axis_number(axis) + if isinstance(other, pd.Series): name = _maybe_match_name(self, other) if len(self) != len(other): @@ -594,20 +598,26 @@ def wrapper(self, other): mask = isnull(self) - values = self.get_values() - other = _index.convert_scalar(values,_values_from_object(other)) + if com.is_categorical_dtype(self): + # cats are a special case as get_values() would return an ndarray, which would then + # not take categories ordering into account + # we can go directly to op, as the na_op would just test again and dispatch to it. + res = op(self.values, other) + else: + values = self.get_values() + other = _index.convert_scalar(values,_values_from_object(other)) - if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): - values = values.view('i8') + if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): + values = values.view('i8') - # scalars - res = na_op(values, other) - if np.isscalar(res): - raise TypeError('Could not compare %s type with Series' - % type(other)) + # scalars + res = na_op(values, other) + if np.isscalar(res): + raise TypeError('Could not compare %s type with Series' + % type(other)) - # always return a full value series here - res = _values_from_object(res) + # always return a full value series here + res = _values_from_object(res) res = pd.Series(res, index=self.index, name=self.name, dtype='bool') @@ -693,12 +703,35 @@ def _radd_compat(left, right): return output +_op_descriptions = {'add': {'op': '+', 'desc': 'Addition', 'reversed': False, 'reverse': 'radd'}, + 'sub': {'op': '-', 'desc': 'Subtraction', 'reversed': False, 'reverse': 'rsub'}, + 'mul': {'op': '*', 'desc': 'Multiplication', 'reversed': False, 'reverse': 'rmul'}, + 'mod': {'op': '%', 'desc': 'Modulo', 'reversed': False, 'reverse': 'rmod'}, + 'pow': {'op': '**', 'desc': 'Exponential power', 'reversed': False, 'reverse': 'rpow'}, + 'truediv': {'op': '/', 'desc': 'Floating division', 'reversed': False, 'reverse': 'rtruediv'}, + 'floordiv': {'op': '//', 'desc': 'Integer division', 'reversed': False, 'reverse': 'rfloordiv'}} + +_op_names = list(_op_descriptions.keys()) +for k in _op_names: + reverse_op = _op_descriptions[k]['reverse'] + _op_descriptions[reverse_op] = _op_descriptions[k].copy() + _op_descriptions[reverse_op]['reversed'] = True + _op_descriptions[reverse_op]['reverse'] = k def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs): + op_name = name.replace('__', '') + op_desc = _op_descriptions[op_name] + if op_desc['reversed']: + equiv = 'other ' + op_desc['op'] + ' series' + else: + equiv = 'series ' + op_desc['op'] + ' other' + doc = """ - Binary operator %s with support to substitute a fill_value for missing data - in one of the inputs + %s of series and other, element-wise 
(binary operator `%s`). + + Equivalent to ``%s``, but with support to substitute a fill_value for + missing data in one of the inputs. Parameters ---------- @@ -713,7 +746,11 @@ def _flex_method_SERIES(op, name, str_rep, default_axis=None, Returns ------- result : Series - """ % name + + See also + -------- + Series.%s + """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -803,7 +840,48 @@ def na_op(x, y): return result - @Appender(_arith_doc_FRAME % name) + if name in _op_descriptions: + op_name = name.replace('__', '') + op_desc = _op_descriptions[op_name] + if op_desc['reversed']: + equiv = 'other ' + op_desc['op'] + ' dataframe' + else: + equiv = 'dataframe ' + op_desc['op'] + ' other' + + doc = """ + %s of dataframe and other, element-wise (binary operator `%s`). + + Equivalent to ``%s``, but with support to substitute a fill_value for + missing data in one of the inputs. + + Parameters + ---------- + other : Series, DataFrame, or constant + axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on + fill_value : None or float value, default None + Fill missing (NaN) values with this value. If both DataFrame locations are + missing, the result will be missing + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Notes + ----- + Mismatched indices will be unioned together + + Returns + ------- + result : DataFrame + + See also + -------- + DataFrame.%s + """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + else: + doc = _arith_doc_FRAME % name + + @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, pd.DataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 7df23a54c737d..580510829baff 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -6,7 +6,6 @@ from pandas.compat import (map, zip, range, lrange, lmap, u, OrderedDict, OrderedDefaultdict) from pandas import compat -import sys import warnings import numpy as np from pandas.core.common import (PandasError, _try_sort, _default_index, @@ -27,14 +26,15 @@ deprecate_kwarg) import pandas.core.common as com import pandas.core.ops as ops -import pandas.core.nanops as nanops import pandas.computation.expressions as expressions from pandas import lib +from pandas.core.ops import _op_descriptions + _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', klass="Panel", - axes_single_arg="{0,1,2,'items','major_axis','minor_axis'}") + axes_single_arg="{0, 1, 2, 'items', 'major_axis', 'minor_axis'}") _shared_doc_kwargs['args_transpose'] = ("three positional arguments: each one" "of\n %s" % _shared_doc_kwargs['axes_single_arg']) @@ -239,7 +239,8 @@ def from_dict(cls, data, intersect=False, orient='items', dtype=None): (default). 
Otherwise if the columns of the values of the passed DataFrame objects should be the items (which in the case of mixed-dtype data you should do), instead pass 'minor' - + dtype : dtype, default None + Data type to force, otherwise infer Returns ------- @@ -1161,6 +1162,14 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, def transpose(self, *args, **kwargs): return super(Panel, self).transpose(*args, **kwargs) + @Appender(_shared_docs['fillna'] % _shared_doc_kwargs) + def fillna(self, value=None, method=None, axis=None, inplace=False, + limit=None, downcast=None, **kwargs): + return super(Panel, self).fillna(value=value, method=method, + axis=axis, inplace=inplace, + limit=limit, downcast=downcast, + **kwargs) + def count(self, axis='major'): """ Return number of observations over requested axis. @@ -1184,13 +1193,17 @@ def count(self, axis='major'): @deprecate_kwarg(old_arg_name='lags', new_arg_name='periods') def shift(self, periods=1, freq=None, axis='major'): """ - Shift major or minor axis by specified number of leads/lags. Drops - periods right now compared with DataFrame.shift + Shift index by desired number of periods with an optional time freq. + The shifted data will not include the dropped periods and the + shifted axis will be smaller than the original. This is different + from the behavior of DataFrame.shift() Parameters ---------- - lags : int - axis : {'major', 'minor'} + periods : int + Number of periods to move, can be positive or negative + freq : DateOffset, timedelta, or time rule string, optional + axis : {'items', 'major', 'minor'} or {0, 1, 2} Returns ------- @@ -1199,9 +1212,6 @@ def shift(self, periods=1, freq=None, axis='major'): if freq: return self.tshift(periods, freq, axis=axis) - if axis == 'items': - raise ValueError('Invalid axis') - return super(Panel, self).slice_shift(periods, axis=axis) def tshift(self, periods=1, freq=None, axis='major', **kwds): @@ -1374,6 +1384,7 @@ def _homogenize_dict(self, frames, intersect=True, dtype=None): result[key] = None axes_dict['data'] = result + axes_dict['dtype'] = dtype return axes_dict @staticmethod @@ -1428,7 +1439,7 @@ def _add_aggregate_operations(cls, use_numexpr=True): ---------- other : %s or %s""" % (cls._constructor_sliced.__name__, cls.__name__) + """ axis : {""" + ', '.join(cls._AXIS_ORDERS) + "}" + """ -Axis to broadcast over + Axis to broadcast over Returns ------- @@ -1450,8 +1461,36 @@ def na_op(x, y): result = com._fill_zeros(result, x, y, name, fill_zeros) return result - @Substitution(name) - @Appender(_agg_doc) + if name in _op_descriptions: + op_name = name.replace('__', '') + op_desc = _op_descriptions[op_name] + if op_desc['reversed']: + equiv = 'other ' + op_desc['op'] + ' panel' + else: + equiv = 'panel ' + op_desc['op'] + ' other' + + _op_doc = """ + %%s of series and other, element-wise (binary operator `%%s`). + Equivalent to ``%%s``. 
+ + Parameters + ---------- + other : %s or %s""" % (cls._constructor_sliced.__name__, cls.__name__) + """ + axis : {""" + ', '.join(cls._AXIS_ORDERS) + "}" + """ + Axis to broadcast over + + Returns + ------- + """ + cls.__name__ + """ + + See also + -------- + """ + cls.__name__ + ".%s\n" + doc = _op_doc % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + else: + doc = _agg_doc % name + + @Appender(doc) def f(self, other, axis=0): return self._combine(other, na_op, axis=axis) f.__name__ = name diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index ec0a313ff5767..35e6412efc760 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -1,6 +1,5 @@ """ Factory methods to create N-D panels """ -import pandas.lib as lib from pandas.compat import zip import pandas.compat as compat @@ -99,7 +98,7 @@ def _combine_with_constructor(self, other, func): for f in ['to_frame', 'to_excel', 'to_sparse', 'groupby', 'join', 'filter', 'dropna', 'shift']: def func(self, *args, **kwargs): - raise NotImplementedError + raise NotImplementedError("this operation is not supported") setattr(klass, f, func) # add the aggregate operations diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 291a73778197a..3225b4aa33ac2 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -9,9 +9,12 @@ from pandas.core.series import Series from pandas.core.frame import DataFrame +from pandas.core.sparse import SparseDataFrame, SparseSeries +from pandas.sparse.array import SparseArray +from pandas._sparse import IntIndex + from pandas.core.categorical import Categorical -from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, - isnull) +from pandas.core.common import notnull, _ensure_platform_int, _maybe_promote from pandas.core.groupby import get_group_index, _compress_group_index import pandas.core.common as com @@ -608,7 +611,7 @@ def _convert_level_number(level_num, columns): new_data[key] = value_slice.ravel() if len(drop_cols) > 0: - new_columns = new_columns - drop_cols + new_columns = new_columns.difference(drop_cols) N = len(this) @@ -932,7 +935,7 @@ def melt_stub(df, stub, i, j): return newdf.set_index([i, j]) def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None): + columns=None, sparse=False): """ Convert categorical variable into dummy/indicator variables @@ -953,6 +956,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. + sparse : bool, default False + Whether the returned DataFrame should be sparse or not. 
Returns ------- @@ -1039,16 +1044,17 @@ def check_len(item, name): with_dummies = [result] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): - dummy = _get_dummies_1d(data[col], prefix=pre, - prefix_sep=sep, dummy_na=dummy_na) + dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, + dummy_na=dummy_na, sparse=sparse) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: - result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na) + result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, + sparse=sparse) return result -def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): +def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories @@ -1059,19 +1065,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): index = data.index else: index = np.arange(len(data)) - return DataFrame(index=index) - - number_of_cols = len(levels) - if dummy_na: - number_of_cols += 1 - - dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) + if not sparse: + return DataFrame(index=index) + else: + return SparseDataFrame(index=index) + codes = cat.codes.copy() if dummy_na: + codes[codes == -1] = len(cat.categories) levels = np.append(cat.categories, np.nan) - else: - # reset NaN GH4446 - dummy_mat[cat.codes == -1] = 0 + + number_of_cols = len(levels) if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) @@ -1084,7 +1088,31 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): else: index = None - return DataFrame(dummy_mat, index=index, columns=dummy_cols) + if sparse: + sparse_series = {} + N = len(data) + sp_indices = [ [] for _ in range(len(dummy_cols)) ] + for ndx, code in enumerate(codes): + if code == -1: + # Blank entries if not dummy_na and code == -1, #GH4446 + continue + sp_indices[code].append(ndx) + + for col, ixs in zip(dummy_cols, sp_indices): + sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), + fill_value=0) + sparse_series[col] = SparseSeries(data=sarr, index=index) + + return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) + + else: + dummy_mat = np.eye(number_of_cols).take(codes, axis=0) + + if not dummy_na: + # reset NaN GH4446 + dummy_mat[codes == -1] = 0 + + return DataFrame(dummy_mat, index=index, columns=dummy_cols) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/core/series.py b/pandas/core/series.py index 68f3a6032402f..c54bd96f64c73 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,8 +19,8 @@ is_list_like, _values_from_object, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, _try_sort, - ABCSparseArray, _maybe_match_name, _coerce_to_dtype, - _ensure_object, SettingWithCopyError, + ABCSparseArray, _maybe_match_name, + _coerce_to_dtype, SettingWithCopyError, _maybe_box_datetimelike, ABCDataFrame) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index) @@ -28,7 +28,6 @@ from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical, CategoricalAccessor -from pandas.core.strings import StringMethods from pandas.tseries.common import (maybe_to_datetimelike, CombinedDatetimelikeProperties) from pandas.tseries.index import DatetimeIndex @@ -60,7 +59,7 @@ _shared_doc_kwargs = dict( axes='index', klass='Series', - 
axes_single_arg="{0,'index'}", + axes_single_arg="{0, 'index'}", inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", duplicated='Series' @@ -141,7 +140,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): - raise NotImplementedError + raise NotImplementedError("initializing a Series from a " + "MultiIndex is not supported") elif isinstance(data, Index): # need to copy to avoid aliasing issues if name is None: @@ -236,6 +236,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, def _constructor(self): return Series + @property + def _constructor_expanddim(self): + from pandas.core.frame import DataFrame + return DataFrame + # types @property def _can_hold_na(self): @@ -1047,11 +1052,10 @@ def to_frame(self, name=None): ------- data_frame : DataFrame """ - from pandas.core.frame import DataFrame if name is None: - df = DataFrame(self) + df = self._constructor_expanddim(self) else: - df = DataFrame({name: self}) + df = self._constructor_expanddim({name: self}) return df @@ -1438,7 +1442,7 @@ def searchsorted(self, v, side='left', sorter=None): def append(self, to_append, verify_integrity=False): """ - Concatenate two or more Series. The indexes must not overlap + Concatenate two or more Series. Parameters ---------- @@ -1504,7 +1508,12 @@ def _binop(self, other, func, level=None, fill_value=None): result = func(this_vals, other_vals) name = _maybe_match_name(self, other) - return self._constructor(result, index=new_index).__finalize__(self) + result = self._constructor(result, index=new_index, name=name) + result = result.__finalize__(self) + if name is None: + # When name is None, __finalize__ overwrites current name + result.name = None + return result def combine(self, other, func, fill_value=nan): """ @@ -2139,6 +2148,19 @@ def rename(self, index=None, **kwargs): def reindex(self, index=None, **kwargs): return super(Series, self).reindex(index=index, **kwargs) + @Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs) + def fillna(self, value=None, method=None, axis=None, inplace=False, + limit=None, downcast=None, **kwargs): + return super(Series, self).fillna(value=value, method=method, + axis=axis, inplace=inplace, + limit=limit, downcast=downcast, + **kwargs) + + @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) + def shift(self, periods=1, freq=None, axis=0, **kwargs): + return super(Series, self).shift(periods=periods, freq=freq, + axis=axis, **kwargs) + def reindex_axis(self, labels, axis=0, **kwargs): """ for compatibility with higher dims """ if axis != 0: @@ -2494,21 +2516,6 @@ def to_period(self, freq=None, copy=True): return self._constructor(new_values, index=new_index).__finalize__(self) - #------------------------------------------------------------------------------ - # string methods - - def _make_str_accessor(self): - if not com.is_object_dtype(self.dtype): - # this really should exclude all series with any non-string values, - # but that isn't practical for performance reasons until we have a - # str dtype (GH 9343) - raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - return StringMethods(self) - - str = base.AccessorProperty(StringMethods, _make_str_accessor) - #------------------------------------------------------------------------------ # Datetimelike delegation methods @@ -2532,6 +2539,21 @@ def _make_cat_accessor(self): 
cat = base.AccessorProperty(CategoricalAccessor, _make_cat_accessor) + def _dir_deletions(self): + return self._accessors + + def _dir_additions(self): + rv = set() + # these accessors are mutually exclusive, so break loop when one exists + for accessor in self._accessors: + try: + getattr(self, accessor) + rv.add(accessor) + break + except AttributeError: + pass + return rv + Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) Series._add_numeric_operations() @@ -2605,8 +2627,9 @@ def _try_cast(arr, take_fast_path): # GH #846 if isinstance(data, (np.ndarray, Index, Series)): - subarr = np.array(data, copy=False) + if dtype is not None: + subarr = np.array(data, copy=False) # possibility of nan -> garbage if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 97f6752fb5851..f4ac0166cf44b 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,9 +1,9 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import isnull, _values_from_object +from pandas.core.common import isnull, _values_from_object, is_bool_dtype import pandas.compat as compat -from pandas.util.decorators import Appender +from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib import warnings @@ -27,19 +27,42 @@ def _get_array_list(arr, others): def str_cat(arr, others=None, sep=None, na_rep=None): """ - Concatenate arrays of strings with given separator + Concatenate strings in the Series/Index with given separator. Parameters ---------- - arr : list or array-like - others : list or array, or list of arrays + others : list-like, or list of list-likes + If None, returns str concatenating strings of the Series sep : string or None, default None na_rep : string or None, default None If None, an NA in any array will propagate Returns ------- - concat : array + concat : Series/Index of objects or str + + Examples + -------- + If ``others`` is specified, corresponding values are + concatenated with the separator. Result will be a Series of strings. + + >>> Series(['a', 'b', 'c']).str.cat(['A', 'B', 'C'], sep=',') + 0 a,A + 1 b,B + 2 c,C + dtype: object + + Otherwise, strings in the Series are concatenated. Result will be a string. + + >>> Series(['a', 'b', 'c']).str.cat(sep=',') + 'a,b,c' + + Also, you can pass a list of list-likes. + + >>> Series(['a', 'b']).str.cat([['x', 'y'], ['1', '2']], sep=',') + 0 a,x,1 + 1 b,y,2 + dtype: object """ if sep is None: sep = '' @@ -130,18 +153,17 @@ def g(x): def str_count(arr, pat, flags=0): """ - Count occurrences of pattern in each string + Count occurrences of pattern in each string of the Series/Index. Parameters ---------- - arr : list or array-like pat : string, valid regular expression flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE Returns ------- - counts : arrays + counts : Series/Index of integer values """ regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) @@ -150,7 +172,8 @@ def str_count(arr, pat, flags=0): def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): """ - Check whether given pattern is contained in each string in the array + Return boolean Series/``array`` whether given pattern/regex is + contained in each string in the Series/Index. 
Parameters ---------- @@ -166,7 +189,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): Returns ------- - Series of boolean values + contained : Series/array of boolean values See Also -------- @@ -197,8 +220,9 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): def str_startswith(arr, pat, na=np.nan): """ - Return boolean array indicating whether each string starts with passed - pattern + Return boolean Series/``array`` indicating whether each string in the + Series/Index starts with passed pattern. Equivalent to + :meth:`str.startswith`. Parameters ---------- @@ -208,7 +232,7 @@ def str_startswith(arr, pat, na=np.nan): Returns ------- - startswith : array (boolean) + startswith : Series/array of boolean values """ f = lambda x: x.startswith(pat) return _na_map(f, arr, na, dtype=bool) @@ -216,8 +240,9 @@ def str_startswith(arr, pat, na=np.nan): def str_endswith(arr, pat, na=np.nan): """ - Return boolean array indicating whether each string ends with passed - pattern + Return boolean Series indicating whether each string in the + Series/Index ends with passed pattern. Equivalent to + :meth:`str.endswith`. Parameters ---------- @@ -227,7 +252,7 @@ def str_endswith(arr, pat, na=np.nan): Returns ------- - endswith : array (boolean) + endswith : Series/array of boolean values """ f = lambda x: x.endswith(pat) return _na_map(f, arr, na, dtype=bool) @@ -235,7 +260,9 @@ def str_endswith(arr, pat, na=np.nan): def str_replace(arr, pat, repl, n=-1, case=True, flags=0): """ - Replace + Replace occurrences of pattern/regex in the Series/Index with + some other string. Equivalent to :meth:`str.replace` or + :func:`re.sub`. Parameters ---------- @@ -252,7 +279,7 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Returns ------- - replaced : array + replaced : Series/Index of objects """ use_re = not case or len(pat) > 1 or flags @@ -272,7 +299,8 @@ def f(x): def str_repeat(arr, repeats): """ - Duplicate each string in the array by indicated number of times + Duplicate each string in the Series/Index by indicated number + of times. Parameters ---------- @@ -281,7 +309,7 @@ def str_repeat(arr, repeats): Returns ------- - repeated : array + repeated : Series/Index of objects """ if np.isscalar(repeats): def rep(x): @@ -305,7 +333,8 @@ def rep(x, r): def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): """ - Deprecated: Find groups in each string using passed regular expression. + Deprecated: Find groups in each string in the Series/Index + using passed regular expression. If as_indexer=True, determine if each string matches a regular expression. Parameters @@ -322,9 +351,9 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): Returns ------- - Series of boolean values + Series/array of boolean values if as_indexer=True - Series of tuples + Series/Index of tuples if as_indexer=False, default but deprecated See Also @@ -359,6 +388,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): if (not as_indexer) and regex.groups > 0: dtype = object + def f(x): m = regex.match(x) if m: @@ -382,7 +412,8 @@ def _get_single_group_name(rx): def str_extract(arr, pat, flags=0): """ - Find groups in each string using passed regular expression + Find groups in each string in the Series using passed regular + expression. 
Parameters ---------- @@ -435,12 +466,14 @@ def str_extract(arr, pat, flags=0): """ from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index regex = re.compile(pat, flags=flags) # just to be safe, check this if regex.groups == 0: raise ValueError("This pattern contains no groups to capture.") empty_row = [np.nan]*regex.groups + def f(x): if not isinstance(x, compat.string_types): return empty_row @@ -449,11 +482,14 @@ def f(x): return [np.nan if item is None else item for item in m.groups()] else: return empty_row + if regex.groups == 1: - result = Series([f(val)[0] for val in arr], - name=_get_single_group_name(regex), - index=arr.index, dtype=object) + result = np.array([f(val)[0] for val in arr], dtype=object) + name = _get_single_group_name(regex) else: + if isinstance(arr, Index): + raise ValueError("only one regex group is supported with Index") + name = None names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] if arr.empty: @@ -463,12 +499,22 @@ def f(x): columns=columns, index=arr.index, dtype=object) - return result + return result, name def str_get_dummies(arr, sep='|'): """ - Split each string by sep and return a frame of dummy/indicator variables. + Split each string in the Series by sep and return a frame of + dummy/indicator variables. + + Parameters + ---------- + sep : string, default "|" + String to split on. + + Returns + ------- + dummies : DataFrame Examples -------- @@ -478,16 +524,22 @@ def str_get_dummies(arr, sep='|'): 1 1 0 0 2 1 0 1 - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + >>> Series(['a|b', np.nan, 'a|c']).str.get_dummies() a b c 0 1 1 0 1 0 0 0 2 1 0 1 - See also ``pd.get_dummies``. - + See Also + -------- + pandas.get_dummies """ from pandas.core.frame import DataFrame + from pandas.core.index import Index + + # GH9980, Index.str does not support get_dummies() as it returns a frame + if isinstance(arr, Index): + raise TypeError("get_dummies is not supported for string methods on Index") # TODO remove this hack? arr = arr.fillna('') @@ -511,7 +563,8 @@ def str_get_dummies(arr, sep='|'): def str_join(arr, sep): """ - Join lists contained as elements in array, a la str.join + Join lists contained as elements in the Series/Index with + passed delimiter. Equivalent to :meth:`str.join`. Parameters ---------- @@ -520,14 +573,15 @@ def str_join(arr, sep): Returns ------- - joined : array + joined : Series/Index of objects """ return _na_map(sep.join, arr) def str_findall(arr, pat, flags=0): """ - Find all occurrences of pattern or regular expression + Find all occurrences of pattern or regular expression in the + Series/Index. Equivalent to :func:`re.findall`. Parameters ---------- @@ -538,7 +592,7 @@ def str_findall(arr, pat, flags=0): Returns ------- - matches : array + matches : Series/Index of lists """ regex = re.compile(pat, flags=flags) return _na_map(regex.findall, arr) @@ -546,8 +600,8 @@ def str_findall(arr, pat, flags=0): def str_find(arr, sub, start=0, end=None, side='left'): """ - Return indexes in each strings where the substring is - fully contained between [start:end]. Return -1 on failure. + Return indexes in each strings in the Series/Index where the + substring is fully contained between [start:end]. Return -1 on failure. 
Parameters ---------- @@ -562,7 +616,7 @@ def str_find(arr, sub, start=0, end=None, side='left'): Returns ------- - found : array + found : Series/Index of integer values """ if not isinstance(sub, compat.string_types): @@ -584,13 +638,33 @@ def str_find(arr, sub, start=0, end=None, side='left'): return _na_map(f, arr, dtype=int) +def str_index(arr, sub, start=0, end=None, side='left'): + if not isinstance(sub, compat.string_types): + msg = 'expected a string object, not {0}' + raise TypeError(msg.format(type(sub).__name__)) + + if side == 'left': + method = 'index' + elif side == 'right': + method = 'rindex' + else: # pragma: no cover + raise ValueError('Invalid side') + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return _na_map(f, arr, dtype=int) + + def str_pad(arr, width, side='left', fillchar=' '): """ - Pad strings with an additional character + Pad strings in the Series/Index with an additional character to + specified side. Parameters ---------- - arr : list or array-like width : int Minimum width of resulting string; additional characters will be filled with spaces @@ -600,7 +674,7 @@ def str_pad(arr, width, side='left', fillchar=' '): Returns ------- - padded : array + padded : Series/Index of objects """ if not isinstance(fillchar, compat.string_types): @@ -622,33 +696,26 @@ def str_pad(arr, width, side='left', fillchar=' '): return _na_map(f, arr) -def str_split(arr, pat=None, n=None, return_type='series'): +def str_split(arr, pat=None, n=None): """ - Split each string (a la re.split) in array by given pattern, propagating NA - values + Split each string (a la re.split) in the Series/Index by given + pattern, propagating NA values. Equivalent to :meth:`str.split`. Parameters ---------- pat : string, default None String or regular expression to split on. If None, splits on whitespace - n : int, default None (all) - return_type : {'series', 'frame'}, default 'series - If frame, returns a DataFrame (elements are strings) - If series, returns an Series (elements are lists of strings). - - Notes - ----- - Both 0 and -1 will be interpreted as return all splits + n : int, default -1 (all) + None, 0 and -1 will be interpreted as return all splits + expand : bool, default False + * If True, return DataFrame/MultiIndex expanding dimensionality. + * If False, return Series/Index. 
+ return_type : deprecated, use `expand` Returns ------- - split : array + split : Series/Index or DataFrame/MultiIndex of objects """ - from pandas.core.series import Series - from pandas.core.frame import DataFrame - - if return_type not in ('series', 'frame'): - raise ValueError("return_type must be {'series', 'frame'}") if pat is None: if n is None or n == 0: n = -1 @@ -663,16 +730,13 @@ def str_split(arr, pat=None, n=None, return_type='series'): n = 0 regex = re.compile(pat) f = lambda x: regex.split(x, maxsplit=n) - if return_type == 'frame': - res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index) - else: - res = _na_map(f, arr) + res = _na_map(f, arr) return res def str_slice(arr, start=None, stop=None, step=None): """ - Slice substrings from each element in array + Slice substrings from each element in the Series/Index Parameters ---------- @@ -682,7 +746,7 @@ def str_slice(arr, start=None, stop=None, step=None): Returns ------- - sliced : array + sliced : Series/Index of objects """ obj = slice(start, stop, step) f = lambda x: x[obj] @@ -691,17 +755,19 @@ def str_slice(arr, start=None, stop=None, step=None): def str_slice_replace(arr, start=None, stop=None, repl=None): """ - Replace a slice of each string with another string. + Replace a slice of each string in the Series/Index with another + string. Parameters ---------- start : int or None stop : int or None repl : str or None + String for replacement Returns ------- - replaced : array + replaced : Series/Index of objects """ if repl is None: repl = '' @@ -721,96 +787,78 @@ def f(x): return _na_map(f, arr) -def str_strip(arr, to_strip=None): - """ - Strip whitespace (including newlines) from each string in the array - - Parameters - ---------- - to_strip : str or unicode - - Returns - ------- - stripped : array - """ - return _na_map(lambda x: x.strip(to_strip), arr) - - -def str_lstrip(arr, to_strip=None): - """ - Strip whitespace (including newlines) from left side of each string in the - array - - Parameters - ---------- - to_strip : str or unicode - - Returns - ------- - stripped : array - """ - return _na_map(lambda x: x.lstrip(to_strip), arr) - - -def str_rstrip(arr, to_strip=None): +def str_strip(arr, to_strip=None, side='both'): """ - Strip whitespace (including newlines) from right side of each string in the - array + Strip whitespace (including newlines) from each string in the + Series/Index. Parameters ---------- to_strip : str or unicode + side : {'left', 'right', 'both'}, default 'both' Returns ------- - stripped : array + stripped : Series/Index of objects """ - return _na_map(lambda x: x.rstrip(to_strip), arr) + if side == 'both': + f = lambda x: x.strip(to_strip) + elif side == 'left': + f = lambda x: x.lstrip(to_strip) + elif side == 'right': + f = lambda x: x.rstrip(to_strip) + else: # pragma: no cover + raise ValueError('Invalid side') + return _na_map(f, arr) def str_wrap(arr, width, **kwargs): - """ - Wrap long strings to be formatted in paragraphs + r""" + Wrap long strings in the Series/Index to be formatted in + paragraphs with length less than a given width. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. 
Parameters ---------- - Same keyword parameters and defaults as :class:`textwrap.TextWrapper` width : int Maximum line-width expand_tabs : bool, optional If true, tab characters will be expanded to spaces (default: True) replace_whitespace : bool, optional - If true, each whitespace character (as defined by string.whitespace) remaining - after tab expansion will be replaced by a single space (default: True) + If true, each whitespace character (as defined by string.whitespace) + remaining after tab expansion will be replaced by a single space + (default: True) drop_whitespace : bool, optional - If true, whitespace that, after wrapping, happens to end up at the beginning - or end of a line is dropped (default: True) + If true, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True) break_long_words : bool, optional - If true, then words longer than width will be broken in order to ensure that - no lines are longer than width. If it is false, long words will not be broken, - and some lines may be longer than width. (default: True) + If true, then words longer than width will be broken in order to ensure + that no lines are longer than width. If it is false, long words will + not be broken, and some lines may be longer than width. (default: True) break_on_hyphens : bool, optional - If true, wrapping will occur preferably on whitespace and right after hyphens - in compound words, as it is customary in English. If false, only whitespaces - will be considered as potentially good places for line breaks, but you need - to set break_long_words to false if you want truly insecable words. - (default: True) + If true, wrapping will occur preferably on whitespace and right after + hyphens in compound words, as it is customary in English. If false, + only whitespaces will be considered as potentially good places for line + breaks, but you need to set break_long_words to false if you want truly + insecable words. (default: True) Returns ------- - wrapped : array + wrapped : Series/Index of objects Notes ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with default - settings. To achieve behavior matching R's stringr library str_wrap function, use - the arguments: + Internally, this method uses a :class:`textwrap.TextWrapper` instance with + default settings. To achieve behavior matching R's stringr library str_wrap + function, use the arguments: - expand_tabs = False - replace_whitespace = True - drop_whitespace = True - break_long_words = False - break_on_hyphens = False + - expand_tabs = False + - replace_whitespace = True + - drop_whitespace = True + - break_long_words = False + - break_on_hyphens = False Examples -------- @@ -827,9 +875,48 @@ def str_wrap(arr, width, **kwargs): return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr) +def str_translate(arr, table, deletechars=None): + """ + Map all characters in the string through the given mapping table. + Equivalent to standard :meth:`str.translate`. Note that the optional + argument deletechars is only valid if you are using python 2. For python 3, + character deletion should be specified via the table argument. + + Parameters + ---------- + table : dict (python 3), str or None (python 2) + In python 3, table is a mapping of Unicode ordinals to Unicode ordinals, + strings, or None. Unmapped characters are left untouched. Characters + mapped to None are deleted. :meth:`str.maketrans` is a helper function + for making translation tables. 
+ In python 2, table is either a string of length 256 or None. If the + table argument is None, no translation is applied and the operation + simply removes the characters in deletechars. :func:`string.maketrans` + is a helper function for making translation tables. + deletechars : str, optional (python 2) + A string of characters to delete. This argument is only valid + in python 2. + + Returns + ------- + translated : Series/Index of objects + """ + if deletechars is None: + f = lambda x: x.translate(table) + else: + from pandas import compat + if compat.PY3: + raise ValueError("deletechars is not a valid argument for " + "str.translate in python 3. You should simply " + "specify character deletions in the table argument") + f = lambda x: x.translate(table, deletechars) + return _na_map(f, arr) + + def str_get(arr, i): """ - Extract element from lists, tuples, or strings in each element in the array + Extract element from lists, tuples, or strings in each element in the + Series/Index. Parameters ---------- @@ -838,7 +925,7 @@ def str_get(arr, i): Returns ------- - items : array + items : Series/Index of objects """ f = lambda x: x[i] if len(x) > i else np.nan return _na_map(f, arr) @@ -846,7 +933,8 @@ def str_get(arr, i): def str_decode(arr, encoding, errors="strict"): """ - Decode character string to unicode using indicated encoding + Decode character string in the Series/Index to unicode + using indicated encoding. Equivalent to :meth:`str.decode`. Parameters ---------- @@ -855,7 +943,7 @@ def str_decode(arr, encoding, errors="strict"): Returns ------- - decoded : array + decoded : Series/Index of objects """ f = lambda x: x.decode(encoding, errors) return _na_map(f, arr) @@ -863,7 +951,8 @@ def str_decode(arr, encoding, errors="strict"): def str_encode(arr, encoding, errors="strict"): """ - Encode character string to some other encoding using indicated encoding + Encode character string in the Series/Index to some other encoding + using indicated encoding. Equivalent to :meth:`str.encode`. Parameters ---------- @@ -872,7 +961,7 @@ def str_encode(arr, encoding, errors="strict"): Returns ------- - encoded : array + encoded : Series/Index of objects """ f = lambda x: x.encode(encoding, errors) return _na_map(f, arr) @@ -926,9 +1015,9 @@ def do_copy(target): class StringMethods(object): """ - Vectorized string functions for Series. NAs stay NA unless handled - otherwise by a particular method. Patterned after Python's string methods, - with some inspiration from R's stringr package. + Vectorized string functions for Series and Index. NAs stay NA unless + handled otherwise by a particular method. Patterned after Python's string + methods, with some inspiration from R's stringr package. 
Examples -------- @@ -954,29 +1043,132 @@ def __iter__(self): i += 1 g = self.get(i) - def _wrap_result(self, result): + def _wrap_result(self, result, **kwargs): + + # leave as it is to keep extract and get_dummies results + # can be merged to _wrap_result_expand in v0.17 from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index if not hasattr(result, 'ndim'): return result - elif result.ndim == 1: - name = getattr(result, 'name', None) - return Series(result, index=self.series.index, - name=name or self.series.name) + name = kwargs.get('name') or getattr(result, 'name', None) or self.series.name + + if result.ndim == 1: + if isinstance(self.series, Index): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + return Index(result, name=name) + return Series(result, index=self.series.index, name=name) else: assert result.ndim < 3 return DataFrame(result, index=self.series.index) + def _wrap_result_expand(self, result, expand=False): + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + from pandas.core.index import Index, MultiIndex + if not hasattr(result, 'ndim'): + return result + + if isinstance(self.series, Index): + name = getattr(result, 'name', None) + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if hasattr(result, 'dtype') and is_bool_dtype(result): + return result + + if expand: + result = list(result) + return MultiIndex.from_tuples(result, names=name) + else: + return Index(result, name=name) + else: + index = self.series.index + if expand: + cons_row = self.series._constructor + cons = self.series._constructor_expanddim + data = [cons_row(x) for x in result] + return cons(data, index=index) + else: + name = getattr(result, 'name', None) + cons = self.series._constructor + return cons(result, name=name, index=index) + @copy(str_cat) def cat(self, others=None, sep=None, na_rep=None): result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep) return self._wrap_result(result) + @deprecate_kwarg('return_type', 'expand', + mapping={'series': False, 'frame': True}) @copy(str_split) - def split(self, pat=None, n=-1, return_type='series'): - result = str_split(self.series, pat, n=n, return_type=return_type) - return self._wrap_result(result) + def split(self, pat=None, n=-1, expand=False): + result = str_split(self.series, pat, n=n) + return self._wrap_result_expand(result, expand=expand) + + _shared_docs['str_partition'] = (""" + Split the string at the %(side)s occurrence of `sep`, and return 3 elements + containing the part before the separator, the separator itself, + and the part after the separator. + If the separator is not found, return %(return)s. + + Parameters + ---------- + pat : string, default whitespace + String to split on. + expand : bool, default True + * If True, return DataFrame/MultiIndex expanding dimensionality. + * If False, return Series/Index. 
+ + Returns + ------- + split : DataFrame/MultiIndex or Series/Index of objects + + See Also + -------- + %(also)s + + Examples + -------- + + >>> s = Series(['A_B_C', 'D_E_F', 'X']) + 0 A_B_C + 1 D_E_F + 2 X + dtype: object + + >>> s.str.partition('_') + 0 1 2 + 0 A _ B_C + 1 D _ E_F + 2 X + + >>> s.str.rpartition('_') + 0 1 2 + 0 A_B _ C + 1 D_E _ F + 2 X + """) + @Appender(_shared_docs['str_partition'] % {'side': 'first', + 'return': '3 elements containing the string itself, followed by two empty strings', + 'also': 'rpartition : Split the string at the last occurrence of `sep`'}) + def partition(self, pat=' ', expand=True): + f = lambda x: x.partition(pat) + result = _na_map(f, self.series) + return self._wrap_result_expand(result, expand=expand) + + @Appender(_shared_docs['str_partition'] % {'side': 'last', + 'return': '3 elements containing two empty strings, followed by the string itself', + 'also': 'partition : Split the string at the first occurrence of `sep`'}) + def rpartition(self, pat=' ', expand=True): + f = lambda x: x.rpartition(pat) + result = _na_map(f, self.series) + return self._wrap_result_expand(result, expand=expand) @copy(str_get) def get(self, i): @@ -997,7 +1189,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): result = str_match(self.series, pat, case=case, flags=flags, - na=na, as_indexer=as_indexer) + na=na, as_indexer=as_indexer) return self._wrap_result(result) @copy(str_replace) @@ -1017,7 +1209,8 @@ def pad(self, width, side='left', fillchar=' '): return self._wrap_result(result) _shared_docs['str_pad'] = (""" - Filling %s side of strings with an additional character + Filling %(side)s side of strings in the Series/Index with an + additional character. Equivalent to :meth:`str.%(method)s`. Parameters ---------- @@ -1029,34 +1222,36 @@ def pad(self, width, side='left', fillchar=' '): Returns ------- - filled : array + filled : Series/Index of objects """) - @Appender(_shared_docs['str_pad'] % 'left and right') + @Appender(_shared_docs['str_pad'] % dict(side='left and right', + method='center')) def center(self, width, fillchar=' '): return self.pad(width, side='both', fillchar=fillchar) - @Appender(_shared_docs['str_pad'] % 'right') + @Appender(_shared_docs['str_pad'] % dict(side='right', method='right')) def ljust(self, width, fillchar=' '): return self.pad(width, side='right', fillchar=fillchar) - @Appender(_shared_docs['str_pad'] % 'left') + @Appender(_shared_docs['str_pad'] % dict(side='left', method='left')) def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) def zfill(self, width): """" - Filling left side with 0 + Filling left side of strings in the Series/Index with 0. + Equivalent to :meth:`str.zfill`. Parameters ---------- width : int - Minimum width of resulting string; additional characters will be filled - with 0 + Minimum width of resulting string; additional characters will be + filled with 0 Returns ------- - filled : array + filled : Series/Index of objects """ result = str_pad(self.series, width, side='left', fillchar='0') return self._wrap_result(result) @@ -1081,19 +1276,31 @@ def encode(self, encoding, errors="strict"): result = str_encode(self.series, encoding, errors) return self._wrap_result(result) - @copy(str_strip) + _shared_docs['str_strip'] = (""" + Strip whitespace (including newlines) from each string in the + Series/Index from %(side)s. Equivalent to :meth:`str.%(method)s`. 
+ + Returns + ------- + stripped : Series/Index of objects + """) + + @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', + method='strip')) def strip(self, to_strip=None): - result = str_strip(self.series, to_strip) + result = str_strip(self.series, to_strip, side='both') return self._wrap_result(result) - @copy(str_lstrip) + @Appender(_shared_docs['str_strip'] % dict(side='left side', + method='lstrip')) def lstrip(self, to_strip=None): - result = str_lstrip(self.series, to_strip) + result = str_strip(self.series, to_strip, side='left') return self._wrap_result(result) - @copy(str_rstrip) + @Appender(_shared_docs['str_strip'] % dict(side='right side', + method='rstrip')) def rstrip(self, to_strip=None): - result = str_rstrip(self.series, to_strip) + result = str_strip(self.series, to_strip, side='right') return self._wrap_result(result) @copy(str_wrap) @@ -1106,16 +1313,25 @@ def get_dummies(self, sep='|'): result = str_get_dummies(self.series, sep) return self._wrap_result(result) + @copy(str_translate) + def translate(self, table, deletechars=None): + result = str_translate(self.series, table, deletechars) + return self._wrap_result(result) + count = _pat_wrapper(str_count, flags=True) startswith = _pat_wrapper(str_startswith, na=True) endswith = _pat_wrapper(str_endswith, na=True) findall = _pat_wrapper(str_findall, flags=True) - extract = _pat_wrapper(str_extract, flags=True) + + @copy(str_extract) + def extract(self, pat, flags=0): + result, name = str_extract(self.series, pat, flags=flags) + return self._wrap_result(result, name=name) _shared_docs['find'] = (""" - Return %(side)s indexes in each strings where the substring is - fully contained between [start:end]. Return -1 on failure. - Equivalent to standard ``str.%(method)s``. + Return %(side)s indexes in each strings in the Series/Index + where the substring is fully contained between [start:end]. + Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`. Parameters ---------- @@ -1128,7 +1344,7 @@ def get_dummies(self, sep='|'): Returns ------- - found : array + found : Series/Index of integer values See Also -------- @@ -1147,46 +1363,107 @@ def rfind(self, sub, start=0, end=None): result = str_find(self.series, sub, start=start, end=end, side='right') return self._wrap_result(result) + def normalize(self, form): + """Return the Unicode normal form for the strings in the Series/Index. + For more information on the forms, see the + :func:`unicodedata.normalize`. + + Parameters + ---------- + form : {'NFC', 'NFKC', 'NFD', 'NFKD'} + Unicode form + + Returns + ------- + normalized : Series/Index of objects + """ + import unicodedata + f = lambda x: unicodedata.normalize(form, compat.u_safe(x)) + result = _na_map(f, self.series) + return self._wrap_result(result) + + _shared_docs['index'] = (""" + Return %(side)s indexes in each strings where the substring is + fully contained between [start:end]. This is the same as ``str.%(similar)s`` + except instead of returning -1, it raises a ValueError when the substring + is not found. Equivalent to standard ``str.%(method)s``. 
+ + Parameters + ---------- + sub : str + Substring being searched + start : int + Left edge index + end : int + Right edge index + + Returns + ------- + found : Series/Index of objects + + See Also + -------- + %(also)s + """) + + @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index', + also='rindex : Return highest indexes in each strings')) + def index(self, sub, start=0, end=None): + result = str_index(self.series, sub, start=start, end=end, side='left') + return self._wrap_result(result) + + @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex', + also='index : Return lowest indexes in each strings')) + def rindex(self, sub, start=0, end=None): + result = str_index(self.series, sub, start=start, end=end, side='right') + return self._wrap_result(result) + _shared_docs['len'] = (""" - Compute length of each string in array. + Compute length of each string in the Series/Index. Returns ------- - lengths : array + lengths : Series/Index of integer values """) len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int) _shared_docs['casemethods'] = (""" - Convert strings in array to %(type)s. - Equivalent to ``str.%(method)s``. + Convert strings in the Series/Index to %(type)s. + Equivalent to :meth:`str.%(method)s`. Returns ------- - converted : array + converted : Series/Index of objects """) _shared_docs['lower'] = dict(type='lowercase', method='lower') _shared_docs['upper'] = dict(type='uppercase', method='upper') _shared_docs['title'] = dict(type='titlecase', method='title') - _shared_docs['capitalize'] = dict(type='be capitalized', method='capitalize') + _shared_docs['capitalize'] = dict(type='be capitalized', + method='capitalize') _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase') lower = _noarg_wrapper(lambda x: x.lower(), - docstring=_shared_docs['casemethods'] % _shared_docs['lower']) + docstring=_shared_docs['casemethods'] % + _shared_docs['lower']) upper = _noarg_wrapper(lambda x: x.upper(), - docstring=_shared_docs['casemethods'] % _shared_docs['upper']) + docstring=_shared_docs['casemethods'] % + _shared_docs['upper']) title = _noarg_wrapper(lambda x: x.title(), - docstring=_shared_docs['casemethods'] % _shared_docs['title']) + docstring=_shared_docs['casemethods'] % + _shared_docs['title']) capitalize = _noarg_wrapper(lambda x: x.capitalize(), - docstring=_shared_docs['casemethods'] % _shared_docs['capitalize']) + docstring=_shared_docs['casemethods'] % + _shared_docs['capitalize']) swapcase = _noarg_wrapper(lambda x: x.swapcase(), - docstring=_shared_docs['casemethods'] % _shared_docs['swapcase']) + docstring=_shared_docs['casemethods'] % + _shared_docs['swapcase']) _shared_docs['ismethods'] = (""" - Check whether all characters in each string in the array are %(type)s. - Equivalent to ``str.%(method)s``. + Check whether all characters in each string in the Series/Index + are %(type)s. Equivalent to :meth:`str.%(method)s`. 
Returns ------- - Series of boolean values + is : Series/array of boolean values """) _shared_docs['isalnum'] = dict(type='alphanumeric', method='isalnum') _shared_docs['isalpha'] = dict(type='alphabetic', method='isalpha') @@ -1198,20 +1475,29 @@ def rfind(self, sub, start=0, end=None): _shared_docs['isnumeric'] = dict(type='numeric', method='isnumeric') _shared_docs['isdecimal'] = dict(type='decimal', method='isdecimal') isalnum = _noarg_wrapper(lambda x: x.isalnum(), - docstring=_shared_docs['ismethods'] % _shared_docs['isalnum']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isalnum']) isalpha = _noarg_wrapper(lambda x: x.isalpha(), - docstring=_shared_docs['ismethods'] % _shared_docs['isalpha']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isalpha']) isdigit = _noarg_wrapper(lambda x: x.isdigit(), - docstring=_shared_docs['ismethods'] % _shared_docs['isdigit']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isdigit']) isspace = _noarg_wrapper(lambda x: x.isspace(), - docstring=_shared_docs['ismethods'] % _shared_docs['isspace']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isspace']) islower = _noarg_wrapper(lambda x: x.islower(), - docstring=_shared_docs['ismethods'] % _shared_docs['islower']) + docstring=_shared_docs['ismethods'] % + _shared_docs['islower']) isupper = _noarg_wrapper(lambda x: x.isupper(), - docstring=_shared_docs['ismethods'] % _shared_docs['isupper']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isupper']) istitle = _noarg_wrapper(lambda x: x.istitle(), - docstring=_shared_docs['ismethods'] % _shared_docs['istitle']) + docstring=_shared_docs['ismethods'] % + _shared_docs['istitle']) isnumeric = _noarg_wrapper(lambda x: compat.u_safe(x).isnumeric(), - docstring=_shared_docs['ismethods'] % _shared_docs['isnumeric']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isnumeric']) isdecimal = _noarg_wrapper(lambda x: compat.u_safe(x).isdecimal(), - docstring=_shared_docs['ismethods'] % _shared_docs['isdecimal']) + docstring=_shared_docs['ismethods'] % + _shared_docs['isdecimal']) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 8bdcfb44242ff..c4cd788216018 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -211,7 +211,6 @@ cdef class StringHashTable(HashTable): def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 object val char *buf @@ -223,12 +222,9 @@ cdef class StringHashTable(HashTable): buf = util.get_c_string(val) k = kh_get_str(self.table, buf) if k == self.table.n_buckets: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - count += 1 + kh_put_str(self.table, buf, &ret) uniques.append(val) - # return None return uniques.to_array() def factorize(self, ndarray[object] values): @@ -258,7 +254,6 @@ cdef class StringHashTable(HashTable): labels[i] = count count += 1 - # return None return reverse, labels cdef class Int32HashTable(HashTable): @@ -319,7 +314,6 @@ cdef class Int32HashTable(HashTable): def lookup(self, ndarray[int32_t] values): cdef: Py_ssize_t i, n = len(values) - int ret = 0 int32_t val khiter_t k ndarray[int32_t] locs = np.empty(n, dtype=np.int64) @@ -357,7 +351,6 @@ cdef class Int32HashTable(HashTable): labels[i] = count count += 1 - # return None return reverse, labels cdef class Int64HashTable: #(HashTable): @@ -518,7 +511,6 @@ cdef class Int64HashTable: #(HashTable): def unique(self, ndarray[int64_t] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count 
= 0 int ret = 0 ndarray result int64_t val @@ -529,9 +521,8 @@ cdef class Int64HashTable: #(HashTable): val = values[i] k = kh_get_int64(self.table, val) if k == self.table.n_buckets: - k = kh_put_int64(self.table, val, &ret) + kh_put_int64(self.table, val, &ret) uniques.append(val) - count += 1 result = uniques.to_array() @@ -644,7 +635,6 @@ cdef class Float64HashTable(HashTable): def unique(self, ndarray[float64_t] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 float64_t val khiter_t k @@ -657,9 +647,8 @@ cdef class Float64HashTable(HashTable): if val == val: k = kh_get_float64(self.table, val) if k == self.table.n_buckets: - k = kh_put_float64(self.table, val, &ret) + kh_put_float64(self.table, val, &ret) uniques.append(val) - count += 1 elif not seen_na: seen_na = 1 uniques.append(ONAN) @@ -786,7 +775,6 @@ cdef class PyObjectHashTable(HashTable): def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - Py_ssize_t idx, count = 0 int ret = 0 object val ndarray result @@ -800,7 +788,7 @@ cdef class PyObjectHashTable(HashTable): if not _checknan(val): k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: - k = kh_put_pymap(self.table, val, &ret) + kh_put_pymap(self.table, val, &ret) uniques.append(val) elif not seen_na: seen_na = 1 @@ -918,7 +906,7 @@ cdef class Int64Factorizer: cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table): cdef: - int k + khiter_t k Py_ssize_t i, n = len(values) int ret = 0 @@ -938,7 +926,6 @@ cpdef value_count_int64(ndarray[int64_t] values): cdef: Py_ssize_t i kh_int64_t *table - int ret = 0 int k table = kh_init_int64() @@ -961,7 +948,7 @@ cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): cdef: - int k + khiter_t k Py_ssize_t i, n = len(values) int ret = 0 @@ -983,7 +970,7 @@ cdef build_count_table_object(ndarray[object] values, cpdef value_count_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: - Py_ssize_t i = len(values) + Py_ssize_t i kh_pymap_t *table int k @@ -1008,9 +995,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): int count, max_count = 2 int j = -1 # so you can do += int k - Py_ssize_t i, n = len(values) kh_pymap_t *table - int ret = 0 table = kh_init_pymap() build_count_table_object(values, mask, table) @@ -1036,11 +1021,10 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): def mode_int64(ndarray[int64_t] values): cdef: - int val, max_val = 2 + int count, max_count = 2 int j = -1 # so you can do += int k kh_int64_t *table - list uniques = [] table = kh_init_int64() @@ -1049,12 +1033,12 @@ def mode_int64(ndarray[int64_t] values): modes = np.empty(table.n_buckets, dtype=np.int64) for k in range(table.n_buckets): if kh_exist_int64(table, k): - val = table.vals[k] + count = table.vals[k] - if val == max_val: + if count == max_count: j += 1 - elif val > max_val: - max_val = val + elif count > max_count: + max_count = count j = 0 else: continue diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 9ecffb382e151..f1fcc822adeaf 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -13,75 +13,22 @@ from pandas.tools.merge import concat from pandas.core.common import PandasError -_IMPORTS = False -_GOOGLE_API_CLIENT_INSTALLED = False -_GOOGLE_API_CLIENT_VALID_VERSION = False -_GOOGLE_FLAGS_INSTALLED = False -_GOOGLE_FLAGS_VALID_VERSION = False -_HTTPLIB2_INSTALLED = False -_SETUPTOOLS_INSTALLED = False -def _importers(): - # 
import things we need - # but make this done on a first use basis - - global _IMPORTS - if _IMPORTS: - return - - _IMPORTS = True - - if not compat.PY3: - - global _GOOGLE_API_CLIENT_INSTALLED, _GOOGLE_API_CLIENT_VALID_VERSION, \ - _GOOGLE_FLAGS_INSTALLED, _GOOGLE_FLAGS_VALID_VERSION, \ - _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED - - try: - import pkg_resources - _SETUPTOOLS_INSTALLED = True - except ImportError: - _SETUPTOOLS_INSTALLED = False - - if _SETUPTOOLS_INSTALLED: - try: - from apiclient.discovery import build - from apiclient.http import MediaFileUpload - from apiclient.errors import HttpError - - from oauth2client.client import OAuth2WebServerFlow - from oauth2client.client import AccessTokenRefreshError - from oauth2client.client import flow_from_clientsecrets - from oauth2client.file import Storage - from oauth2client.tools import run - _GOOGLE_API_CLIENT_INSTALLED=True - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version - - if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2.0': - _GOOGLE_API_CLIENT_VALID_VERSION = True - - except ImportError: - _GOOGLE_API_CLIENT_INSTALLED = False - - - try: - import gflags as flags - _GOOGLE_FLAGS_INSTALLED = True - - _GOOGLE_FLAGS_VERSION = pkg_resources.get_distribution('python-gflags').version +def _check_google_client_version(): + if compat.PY3: + raise NotImplementedError("Google's libraries do not support Python 3 yet") - if LooseVersion(_GOOGLE_FLAGS_VERSION) >= '2.0': - _GOOGLE_FLAGS_VALID_VERSION = True + try: + import pkg_resources - except ImportError: - _GOOGLE_FLAGS_INSTALLED = False + except ImportError: + raise ImportError('Could not import pkg_resources (setuptools).') - try: - import httplib2 - _HTTPLIB2_INSTALLED = True - except ImportError: - _HTTPLIB2_INSTALLED = False + _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version + if LooseVersion(_GOOGLE_API_CLIENT_VERSION) < '1.2.0': + raise ImportError("pandas requires google-api-python-client >= 1.2.0 for Google " + "BigQuery support, current version " + _GOOGLE_API_CLIENT_VERSION) logger = logging.getLogger('pandas.io.gbq') logger.setLevel(logging.ERROR) @@ -142,6 +89,16 @@ def __init__(self, project_id, reauth=False): self.service = self.get_service(self.credentials) def get_credentials(self): + try: + from oauth2client.client import OAuth2WebServerFlow + from oauth2client.file import Storage + from oauth2client.tools import run_flow, argparser + + except ImportError: + raise ImportError('Could not import Google API Client.') + + _check_google_client_version() + flow = OAuth2WebServerFlow(client_id='495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com', client_secret='kOc9wMptUtxkcIFbtZCcrEAc', scope='https://www.googleapis.com/auth/bigquery', @@ -151,11 +108,25 @@ def get_credentials(self): credentials = storage.get() if credentials is None or credentials.invalid or self.reauth: - credentials = run(flow, storage) + credentials = run_flow(flow, storage, argparser.parse_args([])) return credentials def get_service(self, credentials): + try: + import httplib2 + + except ImportError: + raise ImportError("pandas requires httplib2 for Google BigQuery support") + + try: + from apiclient.discovery import build + + except ImportError: + raise ImportError('Could not import Google API Client.') + + _check_google_client_version() + http = httplib2.Http() http = credentials.authorize(http) bigquery_service = build('bigquery', 'v2', http=http) @@ -163,6 +134,15 @@ def 
get_service(self, credentials): return bigquery_service def run_query(self, query): + try: + from apiclient.errors import HttpError + from oauth2client.client import AccessTokenRefreshError + + except ImportError: + raise ImportError('Could not import Google API Client.') + + _check_google_client_version() + job_collection = self.service.jobs() job_data = { 'configuration': { @@ -313,38 +293,6 @@ def _parse_entry(field_value, field_type): return field_value == 'true' return field_value -def _test_imports(): - - _importers() - _GOOGLE_API_CLIENT_INSTALLED - _GOOGLE_API_CLIENT_VALID_VERSION - _GOOGLE_FLAGS_INSTALLED - _GOOGLE_FLAGS_VALID_VERSION - _HTTPLIB2_INSTALLED - _SETUPTOOLS_INSTALLED - - if compat.PY3: - raise NotImplementedError("Google's libraries do not support Python 3 yet") - - if not _SETUPTOOLS_INSTALLED: - raise ImportError('Could not import pkg_resources (setuptools).') - - if not _GOOGLE_API_CLIENT_INSTALLED: - raise ImportError('Could not import Google API Client.') - - if not _GOOGLE_FLAGS_INSTALLED: - raise ImportError('Could not import Google Command Line Flags Module.') - - if not _GOOGLE_API_CLIENT_VALID_VERSION: - raise ImportError("pandas requires google-api-python-client >= 1.2.0 for Google " - "BigQuery support, current version " + _GOOGLE_API_CLIENT_VERSION) - - if not _GOOGLE_FLAGS_VALID_VERSION: - raise ImportError("pandas requires python-gflags >= 2.0.0 for Google " - "BigQuery support, current version " + _GOOGLE_FLAGS_VERSION) - - if not _HTTPLIB2_INSTALLED: - raise ImportError("pandas requires httplib2 for Google BigQuery support") def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=False): """Load data from Google BigQuery. @@ -379,7 +327,6 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa """ - _test_imports() if not project_id: raise TypeError("Missing required parameter: project_id") @@ -450,7 +397,6 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000, if multiple accounts are used. """ - _test_imports() if not project_id: raise TypeError("Missing required parameter: project_id") diff --git a/pandas/io/html.py b/pandas/io/html.py index 9f5c10ce128d2..b806b5147c4a5 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -19,6 +19,7 @@ raise_with_traceback, binary_type) from pandas.core import common as com from pandas import Series +from pandas.core.common import AbstractMethodError _IMPORTS = False _HAS_BS4 = False @@ -229,7 +230,7 @@ def _text_getter(self, obj): text : str or unicode The text from an individual DOM node. """ - raise NotImplementedError + raise AbstractMethodError(self) def _parse_td(self, obj): """Return the td elements from a row element. @@ -243,7 +244,7 @@ def _parse_td(self, obj): columns : list of node-like These are the elements of each row, i.e., the columns. """ - raise NotImplementedError + raise AbstractMethodError(self) def _parse_tables(self, doc, match, attrs): """Return all tables from the parsed DOM. @@ -270,7 +271,7 @@ def _parse_tables(self, doc, match, attrs): tables : list of node-like A list of elements to be parsed into raw data. """ - raise NotImplementedError + raise AbstractMethodError(self) def _parse_tr(self, table): """Return the list of row elements from the parsed table element. @@ -285,7 +286,7 @@ def _parse_tr(self, table): rows : list of node-like A list row elements of a table, usually or ... element. 
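
The abstract parser hooks above now raise `AbstractMethodError(self)` instead of a bare `NotImplementedError`, so the error message names the concrete subclass that failed to override the method. The helper is imported from `pandas.core.common`; its definition is not reproduced here, so the following is only an illustrative sketch of the pattern, with made-up class names, not necessarily the library's exact implementation:

    class AbstractMethodError(NotImplementedError):
        # Illustrative: raised by abstract methods so the message names the
        # concrete subclass that forgot to override them.
        def __init__(self, instance):
            self.instance = instance

        def __str__(self):
            return ("This method must be defined in the concrete class of %s"
                    % type(self.instance).__name__)

    class _BaseTableParser(object):          # stand-in for the real base class
        def _parse_tr(self, table):
            raise AbstractMethodError(self)

    class _BrokenParser(_BaseTableParser):    # subclass missing the override
        pass

    try:
        _BrokenParser()._parse_tr(None)
    except AbstractMethodError as err:
        print(err)   # message points at _BrokenParser, not the base class
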
""" - raise NotImplementedError + raise AbstractMethodError(self) def _parse_tbody(self, table): """Return the body of the table. @@ -315,7 +316,7 @@ def _parse_tbody(self, table): tbody : node-like A ... element. """ - raise NotImplementedError + raise AbstractMethodError(self) def _parse_tfoot(self, table): """Return the footer of the table if any. @@ -330,7 +331,7 @@ def _parse_tfoot(self, table): tfoot : node-like A ... element. """ - raise NotImplementedError + raise AbstractMethodError(self) def _build_doc(self): """Return a tree-like object that can be used to iterate over the DOM. @@ -339,7 +340,7 @@ def _build_doc(self): ------- obj : tree-like """ - raise NotImplementedError + raise AbstractMethodError(self) def _build_table(self, table): header = self._parse_raw_thead(table) diff --git a/pandas/io/json.py b/pandas/io/json.py index 9e8ef74545ef2..0659e34c3f27b 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -11,6 +11,7 @@ from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime from pandas.io.common import get_filepath_or_buffer +from pandas.core.common import AbstractMethodError import pandas.core.common as com loads = _json.loads @@ -33,7 +34,7 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=double_precision, ensure_ascii=force_ascii, date_unit=date_unit, default_handler=default_handler).write() else: - raise NotImplementedError + raise NotImplementedError("'obj' should be a Series or a DataFrame") if isinstance(path_or_buf, compat.string_types): with open(path_or_buf, 'w') as fh: @@ -64,7 +65,7 @@ def __init__(self, obj, orient, date_format, double_precision, self._format_axes() def _format_axes(self): - raise NotImplementedError + raise AbstractMethodError(self) def write(self): return dumps( @@ -282,7 +283,7 @@ def _convert_axes(self): setattr(self.obj, axis, new_axis) def _try_convert_types(self): - raise NotImplementedError + raise AbstractMethodError(self) def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): @@ -395,7 +396,7 @@ def _try_convert_to_date(self, data): return data, False def _try_convert_dates(self): - raise NotImplementedError + raise AbstractMethodError(self) class SeriesParser(Parser): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index b3e2e16af54c2..75ca44fd1ef3e 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -65,26 +65,7 @@ # until we can pass this into our conversion functions, # this is pretty hacky compressor = None -_IMPORTS = False -_BLOSC = False -def _importers(): - # import things we need - # but make this done on a first use basis - - global _IMPORTS - if _IMPORTS: - return - - _IMPORTS = True - - global _BLOSC - import zlib - try: - import blosc - _BLOSC = True - except: - pass def to_msgpack(path_or_buf, *args, **kwargs): """ @@ -103,7 +84,6 @@ def to_msgpack(path_or_buf, *args, **kwargs): compress : type of compressor (zlib or blosc), default to None (no compression) """ - _importers() global compressor compressor = kwargs.pop('compress', None) append = kwargs.pop('append', None) @@ -146,7 +126,6 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs): obj : type of object stored in file """ - _importers() path_or_buf, _ = get_filepath_or_buffer(path_or_buf) if iterator: return Iterator(path_or_buf) @@ -232,9 +211,10 @@ def convert(values): # convert to a bytes array v = v.tostring() + import zlib return zlib.compress(v) - elif compressor == 'blosc' and _BLOSC: + elif compressor == 'blosc': # return string arrays 
like they are if dtype == np.object_: @@ -242,6 +222,7 @@ def convert(values): # convert to a bytes array v = v.tostring() + import blosc return blosc.compress(v, typesize=dtype.itemsize) # ndarray (on original dtype) @@ -253,23 +234,20 @@ def unconvert(values, dtype, compress=None): if dtype == np.object_: return np.array(values, dtype=object) - if compress == 'zlib': + values = values.encode('latin1') + if compress == 'zlib': + import zlib values = zlib.decompress(values) return np.frombuffer(values, dtype=dtype) elif compress == 'blosc': - - if not _BLOSC: - raise Exception("cannot uncompress w/o blosc") - - # decompress + import blosc values = blosc.decompress(values) - return np.frombuffer(values, dtype=dtype) # from a string - return np.fromstring(values.encode('latin1'), dtype=dtype) + return np.fromstring(values, dtype=dtype) def encode(obj): @@ -285,7 +263,8 @@ def encode(obj): 'name': getattr(obj, 'name', None), 'freq': getattr(obj, 'freqstr', None), 'dtype': obj.dtype.num, - 'data': convert(obj.asi8)} + 'data': convert(obj.asi8), + 'compress': compressor} elif isinstance(obj, DatetimeIndex): tz = getattr(obj, 'tz', None) @@ -299,19 +278,22 @@ def encode(obj): 'dtype': obj.dtype.num, 'data': convert(obj.asi8), 'freq': getattr(obj, 'freqstr', None), - 'tz': tz} + 'tz': tz, + 'compress': compressor} elif isinstance(obj, MultiIndex): return {'typ': 'multi_index', 'klass': obj.__class__.__name__, 'names': getattr(obj, 'names', None), 'dtype': obj.dtype.num, - 'data': convert(obj.values)} + 'data': convert(obj.values), + 'compress': compressor} else: return {'typ': 'index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), 'dtype': obj.dtype.num, - 'data': convert(obj.values)} + 'data': convert(obj.values), + 'compress': compressor} elif isinstance(obj, Series): if isinstance(obj, SparseSeries): raise NotImplementedError( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py old mode 100644 new mode 100755 index 637612d5fb09d..1ca396935ae78 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -14,6 +14,7 @@ from pandas.core.frame import DataFrame import datetime import pandas.core.common as com +from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser from pandas.io.common import get_filepath_or_buffer @@ -55,8 +56,11 @@ class ParserWarning(Warning): dtype : Type name or dict of column -> type Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python') -compression : {'gzip', 'bz2', None}, default None - For on-the-fly decompression of on-disk data +compression : {'gzip', 'bz2', 'infer', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use gzip or + bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2', + respectively, and no decompression otherwise. Set to None for no + decompression. dialect : string or csv.Dialect instance, default None If None defaults to Excel dialect. 
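
With `compression='infer'` now the documented default, `read_csv` picks gzip or bz2 from the file extension and otherwise reads the path as-is. A minimal usage sketch (the file names below are illustrative only):

    import pandas as pd

    # '.gz' -> gzip, '.bz2' -> bz2, any other string path -> no decompression
    df_gz = pd.read_csv('data.csv.gz')                         # gzip inferred by default
    df_bz2 = pd.read_csv('data.csv.bz2', compression='infer')  # explicit, same effect
    df_plain = pd.read_csv('data.csv')                         # read without decompression

    # Open buffers carry no file name, so inference falls back to no
    # decompression; pass compression='gzip' or 'bz2' explicitly there.
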
Ignored if sep longer than 1 char See csv.Dialect documentation for more details @@ -294,7 +298,7 @@ def _read(filepath_or_buffer, kwds): 'verbose': False, 'encoding': None, 'squeeze': False, - 'compression': None, + 'compression': 'infer', 'mangle_dupe_cols': True, 'tupleize_cols': False, 'infer_datetime_format': False, @@ -334,7 +338,7 @@ def _make_parser_function(name, sep=','): def parser_f(filepath_or_buffer, sep=sep, dialect=None, - compression=None, + compression='infer', doublequote=True, escapechar=None, @@ -652,6 +656,8 @@ def _clean_options(self, options, engine): # really delete this one keep_default_na = result.pop('keep_default_na') + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): if not isinstance(index_col, (list, tuple, np.ndarray)): index_col = [index_col] @@ -705,7 +711,7 @@ def _make_engine(self, engine='c'): self._engine = klass(self.f, **self.options) def _failover_to_python(self): - raise NotImplementedError + raise AbstractMethodError(self) def read(self, nrows=None): if nrows is not None: @@ -991,7 +997,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try: values = lib.map_infer(values, conv_f) except ValueError: - mask = lib.ismember(values, na_values).view(np.uin8) + mask = lib.ismember(values, na_values).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) coerce_type = False @@ -1314,6 +1320,7 @@ def _wrap_compressed(f, compression, encoding=None): """ compression = compression.lower() encoding = encoding or get_option('display.encoding') + if compression == 'gzip': import gzip @@ -1386,6 +1393,17 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] + if self.compression == 'infer': + if isinstance(f, compat.string_types): + if f.endswith('.gz'): + self.compression = 'gzip' + elif f.endswith('.bz2'): + self.compression = 'bz2' + else: + self.compression = None + else: + self.compression = None + if isinstance(f, compat.string_types): f = com._get_handle(f, 'r', encoding=self.encoding, compression=self.compression) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 458a245da6bdb..4cbc7aeaa3df7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3453,6 +3453,10 @@ def get_blk_items(mgr, blocks): def process_axes(self, obj, columns=None): """ process axes filters """ + # make a copy to avoid side effects + if columns is not None: + columns = list(columns) + # make sure to include levels if we have them if columns is not None and self.is_multi_index: for n in self.levels: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 117d7b4a9ceaa..ad88d74a5aa91 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -83,14 +83,14 @@ def _handle_date_column(col, format=None): return to_datetime(col, **format) else: if format in ['D', 's', 'ms', 'us', 'ns']: - return to_datetime(col, coerce=True, unit=format) + return to_datetime(col, coerce=True, unit=format, utc=True) elif (issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer)): # parse dates as timestamp format = 's' if format is None else format - return to_datetime(col, coerce=True, unit=format) + return to_datetime(col, coerce=True, unit=format, utc=True) else: - return to_datetime(col, coerce=True, format=format) + return to_datetime(col, coerce=True, format=format, utc=True) def _parse_date_columns(data_frame, parse_dates): @@ -318,6 +318,10 @@ def read_sql_table(table_name, con, schema=None, index_col=None, ------- 
DataFrame + Notes + ----- + Any datetime values with time zone information will be converted to UTC + See also -------- read_sql_query : Read SQL query into a DataFrame. @@ -390,6 +394,11 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, ------- DataFrame + Notes + ----- + Any datetime values with time zone information parsed via the `parse_dates` + parameter will be converted to UTC + See also -------- read_sql_table : Read SQL database table into a DataFrame @@ -451,7 +460,8 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, This function is a convenience wrapper around ``read_sql_table`` and ``read_sql_query`` (and for backward compatibility) and will delegate to the specific function depending on the provided input (database - table name or sql query). + table name or sql query). The delegated function might have more specific + notes about their functionality not listed here. See also -------- @@ -531,7 +541,8 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', if isinstance(frame, Series): frame = frame.to_frame() elif not isinstance(frame, DataFrame): - raise NotImplementedError + raise NotImplementedError("'frame' argument should be either a " + "Series or a DataFrame") pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, index_label=index_label, schema=schema, @@ -1434,7 +1445,8 @@ def __init__(self, con, flavor, is_cursor=False): if flavor is None: flavor = 'sqlite' if flavor not in ['sqlite', 'mysql']: - raise NotImplementedError + raise NotImplementedError("flavors other than SQLite and MySQL " + "are not supported") else: self.flavor = flavor diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7dd32fd00a4d2..eecc225d06beb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1626,7 +1626,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? - itemsize = max_len_string_array(column.values) + itemsize = max_len_string_array(com._ensure_object(column.values)) return chr(max(itemsize, 1)) elif dtype == np.float64: return chr(255) @@ -1664,7 +1664,7 @@ def _dtype_to_default_stata_fmt(dtype, column): if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Writing general object arrays is not supported') - itemsize = max_len_string_array(column.values) + itemsize = max_len_string_array(com._ensure_object(column.values)) if itemsize > 244: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" @@ -1885,6 +1885,8 @@ def _prepare_pandas(self, data): #NOTE: we might need a different API / class for pandas objects so # we can set different semantics - handle this with a PR to pandas.io + data = data.copy() + if self._write_index: data = data.reset_index() @@ -2013,7 +2015,7 @@ def _write_variable_labels(self, labels=None): self._write(_pad_bytes("", 81)) def _prepare_data(self): - data = self.data.copy() + data = self.data typlist = self.typlist convert_dates = self._convert_dates # 1. 
Convert dates diff --git a/pandas/io/tests/data/test1.csv.bz2 b/pandas/io/tests/data/test1.csv.bz2 new file mode 100644 index 0000000000000..f96f26a8e7419 Binary files /dev/null and b/pandas/io/tests/data/test1.csv.bz2 differ diff --git a/pandas/io/tests/data/test1.csv.gz b/pandas/io/tests/data/test1.csv.gz new file mode 100644 index 0000000000000..1336db6e2af7e Binary files /dev/null and b/pandas/io/tests/data/test1.csv.gz differ diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index ad6f071d738ff..93d55c654de90 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -336,6 +336,28 @@ def test_empty_field_eof(self): 2: np.array(['3', ''], dtype=object)} assert_array_dicts_equal(result, expected) + # GH5664 + a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c']) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], + columns=list('abcd'), + index=[1, 1]) + c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan], + [8, 9, 10, 11], [13, 14, nan, nan]], + columns=list('abcd'), + index=[0, 5, 7, 12]) + + for _ in range(100): + df = read_csv(StringIO('a,b\nc\n'), skiprows=0, + names=['a'], engine='c') + assert_frame_equal(df, a) + + df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2), + names=list("abcd"), engine='c') + assert_frame_equal(df, b) + + df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'), + names=list('abcd'), engine='c') + assert_frame_equal(df, c) def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index 70a25a45c0ad4..63ed26ea7d931 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -33,7 +33,7 @@ def assert_n_failed_equals_n_null_columns(wngs, obj, cls=SymbolWarning): all_nan_cols = pd.Series(dict((k, pd.isnull(v).all()) for k, v in compat.iteritems(obj))) n_all_nan_cols = all_nan_cols.sum() - valid_warnings = pd.Series([wng for wng in wngs if isinstance(wng, cls)]) + valid_warnings = pd.Series([wng for wng in wngs if wng.category == cls]) assert_equal(len(valid_warnings), n_all_nan_cols) failed_symbols = all_nan_cols[all_nan_cols].index msgs = valid_warnings.map(lambda x: x.message) @@ -79,7 +79,7 @@ def test_get_goog_volume(self): for locale in self.locales: with tm.set_locale(locale): df = web.get_data_google('GOOG').sort_index() - self.assertEqual(df.Volume.ix['OCT-08-2010'], 2863473) + self.assertEqual(df.Volume.ix['JAN-02-2015'], 1446662) @network def test_get_multi1(self): @@ -87,10 +87,10 @@ def test_get_multi1(self): sl = ['AAPL', 'AMZN', 'GOOG'] with tm.set_locale(locale): pan = web.get_data_google(sl, '2012') - ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + ts = pan.Close.GOOG.index[pan.Close.AAPL < pan.Close.GOOG] if (hasattr(pan, 'Close') and hasattr(pan.Close, 'GOOG') and hasattr(pan.Close, 'AAPL')): - self.assertEqual(ts[0].dayofyear, 96) + self.assertEqual(ts[0].dayofyear, 3) else: self.assertRaises(AttributeError, lambda: pan.Close) @@ -105,6 +105,7 @@ def test_get_multi_all_invalid(self): sl = ['INVALID', 'INVALID2', 'INVALID3'] self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') + @network def test_get_multi2(self): with warnings.catch_warnings(record=True) as w: for locale in self.locales: @@ -135,7 +136,7 @@ def test_dtypes(self): def test_unicode_date(self): #GH8967 data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertEquals(data.index.name, 'Date') + self.assertEqual(data.index.name, 'Date') class 
TestYahoo(tm.TestCase): diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 699d1212556cc..768aa40696cbc 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1132,31 +1132,29 @@ def roundtrip(df, header=True, parser_hdr=0): nrows = 5 ncols = 3 - - for i in range(1, 4): # row multindex upto nlevel=3 - for j in range(1, 4): # col "" - df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - res = roundtrip(df) - # shape - self.assertEqual(res.shape, (nrows, ncols + i)) - - # no nans - for r in range(len(res.index)): - for c in range(len(res.columns)): - self.assertTrue(res.ix[r, c] is not np.nan) - - for i in range(1, 4): # row multindex upto nlevel=3 - for j in range(1, 4): # col "" - df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - res = roundtrip(df, False) - # shape - self.assertEqual(res.shape, ( - nrows - 1, ncols + i)) # first row taken as columns - - # no nans - for r in range(len(res.index)): - for c in range(len(res.columns)): - self.assertTrue(res.ix[r, c] is not np.nan) + for use_headers in (True, False): + for i in range(1, 4): # row multindex upto nlevel=3 + for j in range(1, 4): # col "" + df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) + + #this if will be removed once multi column excel writing + #is implemented for now fixing #9794 + if j>1: + with tm.assertRaises(NotImplementedError): + res = roundtrip(df, use_headers) + else: + res = roundtrip(df, use_headers) + + if use_headers: + self.assertEqual(res.shape, (nrows, ncols + i)) + else: + # first row taken as columns + self.assertEqual(res.shape, (nrows - 1, ncols + i)) + + # no nans + for r in range(len(res.index)): + for c in range(len(res.columns)): + self.assertTrue(res.ix[r, c] is not np.nan) res = roundtrip(DataFrame([0])) self.assertEqual(res.shape, (1, 1)) @@ -1394,6 +1392,29 @@ class XlwtTests(ExcelWriterBase, tm.TestCase): engine_name = 'xlwt' check_skip = staticmethod(_skip_if_no_xlwt) + def test_excel_raise_not_implemented_error_on_multiindex_columns(self): + _skip_if_no_xlwt() + #MultiIndex as columns is not yet implemented 9794 + cols = pd.MultiIndex.from_tuples([('site',''), + ('2014','height'), + ('2014','weight')]) + df = pd.DataFrame(np.random.randn(10,3), columns=cols) + with tm.assertRaises(NotImplementedError): + with ensure_clean(self.ext) as path: + df.to_excel(path, index=False) + + def test_excel_multiindex_index(self): + _skip_if_no_xlwt() + #MultiIndex as index works so assert no error #9794 + cols = pd.MultiIndex.from_tuples([('site',''), + ('2014','height'), + ('2014','weight')]) + df = pd.DataFrame(np.random.randn(3,10), index=cols) + with ensure_clean(self.ext) as path: + df.to_excel(path, index=False) + + + def test_to_excel_styleconverter(self): _skip_if_no_xlwt() diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 2f79cc8ba1826..5417842d3f863 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -12,6 +12,9 @@ import numpy as np +from distutils.version import LooseVersion +from pandas import compat + from pandas import NaT from pandas.compat import u from pandas.core.frame import DataFrame @@ -22,6 +25,12 @@ VERSION = platform.python_version() +_IMPORTS = False +_GOOGLE_API_CLIENT_INSTALLED = False +_GOOGLE_API_CLIENT_VALID_VERSION = False +_HTTPLIB2_INSTALLED = False +_SETUPTOOLS_INSTALLED = False + def missing_bq(): try: subprocess.call('bq') @@ -29,9 +38,64 @@ def missing_bq(): except OSError: return True +def _test_imports(): + if not compat.PY3: + + 
global _GOOGLE_API_CLIENT_INSTALLED, _GOOGLE_API_CLIENT_VALID_VERSION, \ + _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED + + try: + import pkg_resources + _SETUPTOOLS_INSTALLED = True + except ImportError: + _SETUPTOOLS_INSTALLED = False + + if _SETUPTOOLS_INSTALLED: + try: + from apiclient.discovery import build + from apiclient.errors import HttpError + + from oauth2client.client import OAuth2WebServerFlow + from oauth2client.client import AccessTokenRefreshError + + from oauth2client.file import Storage + from oauth2client.tools import run_flow + _GOOGLE_API_CLIENT_INSTALLED=True + _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version + + if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2.0': + _GOOGLE_API_CLIENT_VALID_VERSION = True + + except ImportError: + _GOOGLE_API_CLIENT_INSTALLED = False + + + try: + import httplib2 + _HTTPLIB2_INSTALLED = True + except ImportError: + _HTTPLIB2_INSTALLED = False + + + if compat.PY3: + raise NotImplementedError("Google's libraries do not support Python 3 yet") + + if not _SETUPTOOLS_INSTALLED: + raise ImportError('Could not import pkg_resources (setuptools).') + + if not _GOOGLE_API_CLIENT_INSTALLED: + raise ImportError('Could not import Google API Client.') + + if not _GOOGLE_API_CLIENT_VALID_VERSION: + raise ImportError("pandas requires google-api-python-client >= 1.2.0 for Google " + "BigQuery support, current version " + _GOOGLE_API_CLIENT_VERSION) + + if not _HTTPLIB2_INSTALLED: + raise ImportError("pandas requires httplib2 for Google BigQuery support") + def test_requirements(): try: - gbq._test_imports() + _test_imports() except (ImportError, NotImplementedError) as import_exception: raise nose.SkipTest(import_exception) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 1e8ce7afa9492..26fae0717f956 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -324,12 +324,14 @@ def test_frame_to_json_except(self): def test_frame_empty(self): df = DataFrame(columns=['jim', 'joe']) self.assertFalse(df._is_mixed_type) - assert_frame_equal(read_json(df.to_json()), df) + assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df) + def test_frame_empty_mixedtype(self): # mixed type + df = DataFrame(columns=['jim', 'joe']) df['joe'] = df['joe'].astype('i8') self.assertTrue(df._is_mixed_type) - assert_frame_equal(read_json(df.to_json()), df) + assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df) def test_v12_compat(self): df = DataFrame( diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 9633f567ab098..92e0d7ba1a338 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -446,6 +446,45 @@ def test_sparse_panel(self): check_panel_type=True) +class TestCompression(TestPackers): + """See https://github.com/pydata/pandas/pull/9783 + """ + + def setUp(self): + super(TestCompression, self).setUp() + data = { + 'A': np.arange(1000, dtype=np.float64), + 'B': np.arange(1000, dtype=np.int32), + 'C': list(100 * 'abcdefghij'), + 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), + 'E': [datetime.timedelta(days=x) for x in range(1000)], + } + self.frame = { + 'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])), + 'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])), + 'mixed': DataFrame(data), + } + + def test_plain(self): + i_rec = self.encode_decode(self.frame) + for k in self.frame.keys(): + 
assert_frame_equal(self.frame[k], i_rec[k]) + + def test_compression_zlib(self): + i_rec = self.encode_decode(self.frame, compress='zlib') + for k in self.frame.keys(): + assert_frame_equal(self.frame[k], i_rec[k]) + + def test_compression_blosc(self): + try: + import blosc + except ImportError: + raise nose.SkipTest('no blosc') + i_rec = self.encode_decode(self.frame, compress='blosc') + for k in self.frame.keys(): + assert_frame_equal(self.frame[k], i_rec[k]) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py old mode 100644 new mode 100755 index 35530a7f5e07f..48d625744c787 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -17,6 +17,7 @@ from pandas.compat import( StringIO, BytesIO, PY3, range, long, lrange, lmap, u ) + from pandas.io.common import URLError import pandas.io.parsers as parsers from pandas.io.parsers import (read_csv, read_table, read_fwf, @@ -272,7 +273,7 @@ def test_squeeze(self): b,2 c,3 """ - expected = Series([1, 2, 3], ['a', 'b', 'c']) + expected = Series([1, 2, 3], index=Index(['a', 'b', 'c'], name=0)) result = self.read_table(StringIO(data), sep=',', index_col=0, header=None, squeeze=True) tm.assert_isinstance(result, Series) @@ -520,6 +521,11 @@ def test_usecols_index_col_False(self): df = self.read_csv(StringIO(s_malformed), usecols=cols, index_col=False) tm.assert_frame_equal(expected, df) + def test_index_col_is_True(self): + # Issue 9798 + self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data), + index_col=True) + def test_converter_index_col_bug(self): # 1835 data = "A;B\n1;2\n3;4" @@ -839,6 +845,28 @@ def test_deep_skiprows(self): condensed_data = self.read_csv(StringIO(condensed_text)) tm.assert_frame_equal(data, condensed_data) + def test_skiprows_blank(self): + # GH 9832 + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = self.read_csv(StringIO(text), skiprows=6, header=None, + index_col=0, parse_dates=True) + + expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), + columns=[1, 2, 3], + index=[datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)]) + expected.index.name = 0 + tm.assert_frame_equal(data, expected) + def test_detect_string_na(self): data = """A,B foo,bar @@ -954,8 +982,8 @@ def test_yy_format(self): parse_dates=[['date', 'time']]) idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)]).asobject - idx.name = 'date_time' + datetime(2009, 3, 31, 8, 30, 0)], + dtype=object, name='date_time') xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) tm.assert_frame_equal(rs, xp) @@ -963,8 +991,8 @@ def test_yy_format(self): parse_dates=[[0, 1]]) idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)]).asobject - idx.name = 'date_time' + datetime(2009, 3, 31, 8, 30, 0)], + dtype=object, name='date_time') xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) tm.assert_frame_equal(rs, xp) @@ -1071,6 +1099,21 @@ def test_read_csv_no_index_name(self): self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) tm.assert_frame_equal(df, df2) + def test_read_csv_infer_compression(self): + # GH 9770 + expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) + + inputs = [self.csv1, self.csv1 + '.gz', + self.csv1 + '.bz2', open(self.csv1)] + + for f in inputs: + df = self.read_csv(f, index_col=0, parse_dates=True, + compression='infer') + + tm.assert_frame_equal(expected, df) + + inputs[3].close() + def test_read_table_unicode(self): fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8')) df1 = read_table(fin, sep=";", encoding="utf-8", header=None) @@ -2231,6 +2274,26 @@ def test_nrows_and_chunksize_raises_notimplemented(self): self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), nrows=10, chunksize=5) + def test_single_char_leading_whitespace(self): + # GH 9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn' : list('abab')}) + + result = self.read_csv(StringIO(data), skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + def test_chunk_begins_with_newline_whitespace(self): + # GH 10022 + data = '\n hello\nworld\n' + result = self.read_csv(StringIO(data), header=None) + self.assertEqual(len(result), 2) + class TestPythonParser(ParserTests, tm.TestCase): def test_negative_skipfooter_raises(self): @@ -2984,6 +3047,25 @@ def test_variable_width_unicode(self): tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')), header=None, encoding='utf8')) + def test_convert_to_nd_arrays(self): + #GH 9266 + with tm.ensure_clean('test.txt') as path: + with open(path,'w') as f: + f.write( + """1421302964.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 \n""" + + """1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" + ) + + result = pd.read_fwf('test.txt', colspecs=[(0,17),(25,26),(33,37),(49,51),(58,62),(63,1000)], + names=['time','pri','pgn','dst','src','data'], + converters={'pgn':lambda x: int(x,16), + 'src':lambda x: int(x,16), + 'dst':lambda x: int(x,16), + 'data':lambda x: len(x.split(' '))}, + index_col='time') + self.assertEqual(result['dst'].dtype,np.uint8) + + class TestCParserHighMemory(ParserTests, tm.TestCase): @@ -3068,17 +3150,17 @@ def test_skiprows_lineterminator(self): 
expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], ['2007/01/01', '02:00', 0.2141, 'M', 'O'], ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', + columns=['date', 'time', 'var', 'flag', 'oflag']) # test with the three default lineterminators LF, CR and CRLF df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r')), + df = self.read_csv(StringIO(data.replace('\n', '\r')), skiprows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), skiprows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) @@ -3271,6 +3353,25 @@ def test_buffer_overflow(self): except Exception as cperr: self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_single_char_leading_whitespace(self): + # GH 9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn' : list('abab')}) + + result = self.read_csv(StringIO(data), delim_whitespace=True, + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), lineterminator='\n', + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + class TestCParserLowMemory(ParserTests, tm.TestCase): def read_csv(self, *args, **kwds): @@ -3692,6 +3793,25 @@ def test_buffer_overflow(self): except Exception as cperr: self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr)) + def test_single_char_leading_whitespace(self): + # GH 9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn' : list('abab')}) + + result = self.read_csv(StringIO(data), delim_whitespace=True, + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), lineterminator='\n', + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + class TestMiscellaneous(tm.TestCase): # for tests that don't fit into any of the other classes, e.g. 
those that diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index acdc991c92efe..7d9c3c051344f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -156,50 +156,51 @@ def tearDown(self): pass def test_factory_fun(self): + path = create_tempfile(self.path) try: - with get_store(self.path) as tbl: + with get_store(path) as tbl: raise ValueError('blah') except ValueError: pass finally: - safe_remove(self.path) + safe_remove(path) try: - with get_store(self.path) as tbl: + with get_store(path) as tbl: tbl['a'] = tm.makeDataFrame() - with get_store(self.path) as tbl: + with get_store(path) as tbl: self.assertEqual(len(tbl), 1) self.assertEqual(type(tbl['a']), DataFrame) finally: safe_remove(self.path) def test_context(self): + path = create_tempfile(self.path) try: - with HDFStore(self.path) as tbl: + with HDFStore(path) as tbl: raise ValueError('blah') except ValueError: pass finally: - safe_remove(self.path) + safe_remove(path) try: - with HDFStore(self.path) as tbl: + with HDFStore(path) as tbl: tbl['a'] = tm.makeDataFrame() - with HDFStore(self.path) as tbl: + with HDFStore(path) as tbl: self.assertEqual(len(tbl), 1) self.assertEqual(type(tbl['a']), DataFrame) finally: - safe_remove(self.path) + safe_remove(path) def test_conv_read_write(self): - + path = create_tempfile(self.path) try: - def roundtrip(key, obj,**kwargs): - obj.to_hdf(self.path, key,**kwargs) - return read_hdf(self.path, key) + obj.to_hdf(path, key,**kwargs) + return read_hdf(path, key) o = tm.makeTimeSeries() assert_series_equal(o, roundtrip('series',o)) @@ -215,12 +216,12 @@ def roundtrip(key, obj,**kwargs): # table df = DataFrame(dict(A=lrange(5), B=lrange(5))) - df.to_hdf(self.path,'table',append=True) - result = read_hdf(self.path, 'table', where = ['index>2']) + df.to_hdf(path,'table',append=True) + result = read_hdf(path, 'table', where = ['index>2']) assert_frame_equal(df[df.index>2],result) finally: - safe_remove(self.path) + safe_remove(path) def test_long_strings(self): @@ -1593,9 +1594,10 @@ def make_index(names=None): # series _maybe_remove(store, 's') - s = Series(np.zeros(12), index=make_index(['date',None,None])) + s = Series(np.zeros(12), index=make_index(['date', None, None])) store.append('s',s) - tm.assert_series_equal(store.select('s'),s) + xp = Series(np.zeros(12), index=make_index(['date', 'level_1', 'level_2'])) + tm.assert_series_equal(store.select('s'), xp) # dup with column _maybe_remove(store, 'df') @@ -3612,7 +3614,7 @@ def test_frame_select_complex(self): # invert ok for filters result = store.select('df', "~(columns=['A','B'])") - expected = df.loc[:,df.columns-['A','B']] + expected = df.loc[:,df.columns.difference(['A','B'])] tm.assert_frame_equal(result, expected) # in @@ -4328,13 +4330,14 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): df = tm.makeDataFrame() try: - st = HDFStore(self.path) + path = create_tempfile(self.path) + st = HDFStore(path) st.append('df', df, data_columns = ['A']) st.close() - do_copy(f = self.path) - do_copy(f = self.path, propindexes = False) + do_copy(f = path) + do_copy(f = path, propindexes = False) finally: - safe_remove(self.path) + safe_remove(path) def test_legacy_table_write(self): raise nose.SkipTest("cannot write legacy tables") @@ -4584,22 +4587,59 @@ def test_duplicate_column_name(self): with ensure_clean_path(self.path) as path: self.assertRaises(ValueError, df.to_hdf, path, 'df', format='fixed') + df.to_hdf(path, 'df', format='table') + other = 
read_hdf(path, 'df') + + tm.assert_frame_equal(df, other) + self.assertTrue(df.equals(other)) + self.assertTrue(other.equals(df)) + + def test_round_trip_equals(self): + # GH 9330 + df = DataFrame({"B": [1,2], "A": ["x","y"]}) + + with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', format='table') other = read_hdf(path, 'df') tm.assert_frame_equal(df, other) + self.assertTrue(df.equals(other)) + self.assertTrue(other.equals(df)) def test_preserve_timedeltaindex_type(self): - # GH9635 + # GH9635 # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve # the type of the index. df = DataFrame(np.random.normal(size=(10,5))) df.index = timedelta_range(start='0s',periods=10,freq='1s',name='example') with ensure_clean_store(self.path) as store: - + store['df'] = df assert_frame_equal(store['df'], df) + def test_colums_multiindex_modified(self): + # BUG: 7212 + # read_hdf store.select modified the passed columns parameters + # when multi-indexed. + + df = DataFrame(np.random.rand(4, 5), + index=list('abcd'), + columns=list('ABCDE')) + df.index.name = 'letters' + df = df.set_index(keys='E', append=True) + + data_columns = df.index.names+df.columns.tolist() + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', + mode='a', + append=True, + data_columns=data_columns, + index=False) + cols2load = list('BCD') + cols2load_original = list(cols2load) + df_loaded = read_hdf(path, 'df', columns=cols2load) + self.assertTrue(cols2load_original == cols2load) + def _test_sort(obj): if isinstance(obj, DataFrame): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 2db6f1e104770..9576f80696350 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -29,7 +29,7 @@ from datetime import datetime, date, time from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat -from pandas import date_range, to_datetime, to_timedelta +from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types from pandas.core.datetools import format as date_format @@ -100,6 +100,7 @@ 'postgresql': """CREATE TABLE types_test_data ( "TextCol" TEXT, "DateCol" TIMESTAMP, + "DateColWithTz" TIMESTAMP WITH TIME ZONE, "IntDateCol" INTEGER, "FloatCol" DOUBLE PRECISION, "IntCol" INTEGER, @@ -109,18 +110,36 @@ )""" }, 'insert_test_types': { - 'sqlite': """ + 'sqlite': { + 'query': """ INSERT INTO types_test_data VALUES(?, ?, ?, ?, ?, ?, ?, ?) """, - 'mysql': """ + 'fields': ( + 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', + 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + ) + }, + 'mysql': { + 'query': """ INSERT INTO types_test_data VALUES("%s", %s, %s, %s, %s, %s, %s, %s) """, - 'postgresql': """ + 'fields': ( + 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', + 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + ) + }, + 'postgresql': { + 'query': """ INSERT INTO types_test_data - VALUES(%s, %s, %s, %s, %s, %s, %s, %s) - """ + VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s) + """, + 'fields': ( + 'TextCol', 'DateCol', 'DateColWithTz', 'IntDateCol', 'FloatCol', + 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + ) + }, }, 'read_parameters': { 'sqlite': "SELECT * FROM iris WHERE Name=? 
AND SepalLength=?", @@ -218,11 +237,36 @@ def _load_raw_sql(self): self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) ins = SQL_STRINGS['insert_test_types'][self.flavor] - data = [( - 'first', '2000-01-03 00:00:00', 535852800, 10.10, 1, False, 1, False), - ('first', '2000-01-04 00:00:00', 1356998400, 10.10, 1, False, None, None)] + data = [ + { + 'TextCol': 'first', + 'DateCol': '2000-01-03 00:00:00', + 'DateColWithTz': '2000-01-01 00:00:00-08:00', + 'IntDateCol': 535852800, + 'FloatCol': 10.10, + 'IntCol': 1, + 'BoolCol': False, + 'IntColWithNull': 1, + 'BoolColWithNull': False, + }, + { + 'TextCol': 'first', + 'DateCol': '2000-01-04 00:00:00', + 'DateColWithTz': '2000-06-01 00:00:00-07:00', + 'IntDateCol': 1356998400, + 'FloatCol': 10.10, + 'IntCol': 1, + 'BoolCol': False, + 'IntColWithNull': None, + 'BoolColWithNull': None, + }, + ] + for d in data: - self._get_exec().execute(ins, d) + self._get_exec().execute( + ins['query'], + [d[field] for field in ins['fields']] + ) def _count_rows(self, table_name): result = self._get_exec().execute( @@ -1212,10 +1256,14 @@ def test_transactions(self): self._transaction_test() def test_get_schema_create_table(self): - self._load_test2_data() + # Use a dataframe without a bool column, since MySQL converts bool to + # TINYINT (which read_sql_table returns as an int and causes a dtype + # mismatch) + + self._load_test3_data() tbl = 'test_get_schema_create_table' - create_sql = sql.get_schema(self.test_frame2, tbl, con=self.conn) - blank_test_df = self.test_frame2.iloc[:0] + create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn) + blank_test_df = self.test_frame3.iloc[:0] self.drop_table(tbl) self.conn.execute(create_sql) @@ -1279,19 +1327,19 @@ def test_double_precision(self): 'i64':Series([5,], dtype='int64'), }) - df.to_sql('test_dtypes', self.conn, index=False, if_exists='replace', + df.to_sql('test_dtypes', self.conn, index=False, if_exists='replace', dtype={'f64_as_f32':sqlalchemy.Float(precision=23)}) res = sql.read_sql_table('test_dtypes', self.conn) - + # check precision of float64 - self.assertEqual(np.round(df['f64'].iloc[0],14), + self.assertEqual(np.round(df['f64'].iloc[0],14), np.round(res['f64'].iloc[0],14)) # check sql types meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() col_dict = meta.tables['test_dtypes'].columns - self.assertEqual(str(col_dict['f32'].type), + self.assertEqual(str(col_dict['f32'].type), str(col_dict['f64_as_f32'].type)) self.assertTrue(isinstance(col_dict['f32'].type, sqltypes.Float)) self.assertTrue(isinstance(col_dict['f64'].type, sqltypes.Float)) @@ -1512,6 +1560,19 @@ def test_schema_support(self): res2 = pdsql.read_table('test_schema_other2') tm.assert_frame_equal(res1, res2) + def test_datetime_with_time_zone(self): + # Test to see if we read the date column with timezones that + # the timezone information is converted to utc and into a + # np.datetime64 (GH #7139) + df = sql.read_sql_table("types_test_data", self.conn) + self.assertTrue(issubclass(df.DateColWithTz.dtype.type, np.datetime64), + "DateColWithTz loaded with incorrect type") + + # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00" + self.assertEqual(df.DateColWithTz[0], Timestamp('2000-01-01 08:00:00')) + + # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00" + self.assertEqual(df.DateColWithTz[1], Timestamp('2000-06-01 07:00:00')) #------------------------------------------------------------------------------ #--- Test Sqlite / MySQL fallback @@ -1672,11 +1733,11 
@@ def test_illegal_names(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) # Raise error on blank - self.assertRaises(ValueError, df.to_sql, "", self.conn, + self.assertRaises(ValueError, df.to_sql, "", self.conn, flavor=self.flavor) for ndx, weird_name in enumerate(['test_weird_name]','test_weird_name[', - 'test_weird_name`','test_weird_name"', 'test_weird_name\'', + 'test_weird_name`','test_weird_name"', 'test_weird_name\'', '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"']): df.to_sql(weird_name, self.conn, flavor=self.flavor) sql.table_exists(weird_name, self.conn) @@ -1782,12 +1843,12 @@ def test_illegal_names(self): for ndx, illegal_name in enumerate(['test_illegal_name]','test_illegal_name[', 'test_illegal_name`','test_illegal_name"', 'test_illegal_name\'', '']): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) - self.assertRaises(ValueError, df.to_sql, illegal_name, self.conn, + self.assertRaises(ValueError, df.to_sql, illegal_name, self.conn, flavor=self.flavor, index=False) df2 = DataFrame([[1, 2], [3, 4]], columns=['a', illegal_name]) c_tbl = 'test_illegal_col_name%d'%ndx - self.assertRaises(ValueError, df2.to_sql, 'test_illegal_col_name', + self.assertRaises(ValueError, df2.to_sql, 'test_illegal_col_name', self.conn, flavor=self.flavor, index=False) @@ -1964,7 +2025,7 @@ def test_tquery(self): frame = tm.makeTimeDataFrame() sql.write_frame(frame, name='test_table', con=self.db) result = sql.tquery("select A from test_table", self.db) - expected = frame.A + expected = Series(frame.A, frame.index) # not to have name result = Series(result, frame.index) tm.assert_series_equal(result, expected) @@ -2134,6 +2195,13 @@ def setUp(self): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. 
") + def tearDown(self): + from pymysql.err import Error + try: + self.db.close() + except Error: + pass + def test_basic(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() @@ -2302,7 +2370,7 @@ def test_tquery(self): cur.execute(drop_sql) sql.write_frame(frame, name='test_table', con=self.db, flavor='mysql') result = sql.tquery("select A from test_table", self.db) - expected = frame.A + expected = Series(frame.A, frame.index) # not to have name result = Series(result, frame.index) tm.assert_series_equal(result, expected) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 8b44be61d5f66..97bbfb0edf92c 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -290,6 +290,15 @@ def test_stata_doc_examples(self): df = DataFrame(np.random.randn(10, 2), columns=list('AB')) df.to_stata(path) + def test_write_preserves_original(self): + # 9795 + np.random.seed(423) + df = pd.DataFrame(np.random.randn(5,4), columns=list('abcd')) + df.ix[2, 'a':'c'] = np.nan + df_copy = df.copy() + df.to_stata('test.dta', write_index=False) + tm.assert_frame_equal(df, df_copy) + def test_encoding(self): # GH 4626, proper encoding handling @@ -866,8 +875,8 @@ def test_categorical_sorting(self): parsed_117.index = np.arange(parsed_117.shape[0]) codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] - expected = pd.Series(pd.Categorical.from_codes(codes=codes, - categories=categories)) + cat = pd.Categorical.from_codes(codes=codes, categories=categories) + expected = pd.Series(cat, name='srh') tm.assert_series_equal(expected, parsed_115["srh"]) tm.assert_series_equal(expected, parsed_117["srh"]) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 5ab2ee4327177..cc4c43494176e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1,6 +1,7 @@ cimport numpy as np cimport cython import numpy as np +import sys from numpy cimport * @@ -10,6 +11,7 @@ cdef extern from "numpy/arrayobject.h": cdef enum NPY_TYPES: NPY_intp "NPY_INTP" + from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyDict_Contains, PyDict_Keys, Py_INCREF, PyTuple_SET_ITEM, @@ -18,7 +20,14 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyBytes_Check, PyTuple_SetItem, PyTuple_New, - PyObject_SetAttrString) + PyObject_SetAttrString, + PyBytes_GET_SIZE, + PyUnicode_GET_SIZE) + +try: + from cpython cimport PyString_GET_SIZE +except ImportError: + from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX @@ -32,7 +41,6 @@ cdef extern from "Python.h": Py_ssize_t *slicelength) except -1 - cimport cpython isnan = np.isnan @@ -896,23 +904,32 @@ def clean_index_list(list obj): return maybe_convert_objects(converted), 0 + +ctypedef fused pandas_string: + str + unicode + bytes + + @cython.boundscheck(False) @cython.wraparound(False) -def max_len_string_array(ndarray arr): +cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): """ return the maximum size of elements in a 1-dim string array """ cdef: - int i, m, l - int length = arr.shape[0] - object v + Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] + pandas_string v - m = 0 - for i from 0 <= i < length: + for i in range(length): v = arr[i] - if PyString_Check(v) or PyBytes_Check(v) or PyUnicode_Check(v): - l = len(v) + if PyString_Check(v): + l = PyString_GET_SIZE(v) + elif PyBytes_Check(v): + l = PyBytes_GET_SIZE(v) + elif PyUnicode_Check(v): + l = PyUnicode_GET_SIZE(v) - if l > m: - m = l + if l > m: + m = l 
return m @@ -933,7 +950,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): +def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer): cdef int N, j, i, ncols cdef list rows @@ -1306,9 +1323,10 @@ def duplicated(ndarray[object] values, take_last=False): def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: - Py_ssize_t i, group_size, n, lab, start + Py_ssize_t i, group_size, n, start + int64_t lab object slobj - ndarray[int64_t] starts + ndarray[int64_t] starts, ends n = len(labels) @@ -1318,13 +1336,16 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): start = 0 group_size = 0 for i in range(n): - group_size += 1 lab = labels[i] - if i == n - 1 or lab != labels[i + 1]: - starts[lab] = start - ends[lab] = start + group_size - start += group_size - group_size = 0 + if lab < 0: + start += 1 + else: + group_size += 1 + if i == n - 1 or lab != labels[i + 1]: + starts[lab] = start + ends[lab] = start + group_size + start += group_size + group_size = 0 return starts, ends diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d13781d6fa132..b28e0587264d4 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -175,7 +175,7 @@ cdef extern from "parser/tokenizer.h": int col void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) - char* COLITER_NEXT(coliter_t it) + void COLITER_NEXT(coliter_t, const char *) parser_t* parser_new() @@ -212,7 +212,7 @@ cdef extern from "parser/tokenizer.h": inline int to_longlong(char *item, long long *p_value) # inline int to_longlong_thousands(char *item, long long *p_value, # char tsep) - int to_boolean(char *item, uint8_t *val) + int to_boolean(const char *item, uint8_t *val) cdef extern from "parser/io.h": @@ -541,6 +541,17 @@ cdef class TextReader: self.parser.cb_io = NULL self.parser.cb_cleanup = NULL + if self.compression == 'infer': + if isinstance(source, basestring): + if source.endswith('.gz'): + self.compression = 'gzip' + elif source.endswith('.bz2'): + self.compression = 'bz2' + else: + self.compression = None + else: + self.compression = None + if self.compression: if self.compression == 'gzip': import gzip @@ -1279,7 +1290,7 @@ cdef _string_box_factorize(parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1296,7 +1307,7 @@ cdef _string_box_factorize(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1333,7 +1344,7 @@ cdef _string_box_utf8(parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1350,7 +1361,7 @@ cdef _string_box_utf8(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1388,7 +1399,7 @@ cdef _string_box_decode(parser_t *parser, int col, Py_ssize_t i, size size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1407,7 +1418,7 @@ cdef _string_box_decode(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) 
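The `generate_slices` change above makes a label of -1 (a missing group key) advance the running start offset instead of being written into `starts[-1]`. A pure-Python sketch of the fixed loop, with a hypothetical helper name:

    import numpy as np

    def generate_slices_py(labels, ngroups):
        # skip lab < 0; otherwise close a group when the label changes or the
        # input ends, recording [start, end) offsets per group
        starts = np.zeros(ngroups, dtype=np.int64)
        ends = np.zeros(ngroups, dtype=np.int64)
        start = group_size = 0
        n = len(labels)
        for i in range(n):
            lab = labels[i]
            if lab < 0:
                start += 1
            else:
                group_size += 1
                if i == n - 1 or lab != labels[i + 1]:
                    starts[lab] = start
                    ends[lab] = start + group_size
                    start += group_size
                    group_size = 0
        return starts, ends

    print(generate_slices_py(np.array([-1, 0, 0, 1, 1, 1]), 2))
    # (array([1, 3]), array([3, 6]))

The `compression == 'infer'` branch added to `TextReader.__cinit__` in the parser.pyx hunks above resolves the compression from the filename extension before the existing gzip/bz2 handling runs. A sketch of the same rule, assuming a Python 3 string `source` (the Cython code checks `basestring`):

    def infer_compression(source, compression='infer'):
        # only infer from string paths; anything else means no compression
        if compression != 'infer':
            return compression
        if isinstance(source, str):
            if source.endswith('.gz'):
                return 'gzip'
            if source.endswith('.bz2'):
                return 'bz2'
        return None

    print(infer_compression('data.csv.gz'))  # 'gzip'
    print(infer_compression('data.csv'))     # None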
+ COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1444,7 +1455,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, int error Py_ssize_t i, j coliter_t it - char *word + const char *word = NULL char *data ndarray result @@ -1454,7 +1465,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, coliter_setup(&it, parser, col, line_start) for i in range(line_end - line_start): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) strncpy(data, word, width) data += width @@ -1469,7 +1480,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL char *p_end double *data double NA = na_values[np.float64] @@ -1485,7 +1496,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1509,7 +1520,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1530,7 +1541,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL int64_t *data ndarray result @@ -1544,7 +1555,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1561,7 +1572,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return None, None else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1578,7 +1589,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL uint8_t *data ndarray result @@ -1592,7 +1603,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1608,7 +1619,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) error = to_boolean(word, data) if error != 0: @@ -1625,7 +1636,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL uint8_t *data ndarray result @@ -1639,7 +1650,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1667,7 +1678,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1688,33 +1699,6 @@ cdef _try_bool_flex(parser_t *parser, int 
col, int line_start, int line_end, return result.view(np.bool_), na_count -cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end, - kh_str_t *na_hashset): - cdef: - int error - Py_ssize_t i - size_t lines - coliter_t it - char *word - ndarray[uint8_t, cast=True] result - khiter_t k - - lines = line_end - line_start - result = np.empty(lines, dtype=np.bool_) - - coliter_setup(&it, parser, col, line_start) - for i in range(lines): - word = COLITER_NEXT(it) - - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: - result[i] = 1 - else: - result[i] = 0 - - return result - cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1897,7 +1881,7 @@ cdef _apply_converter(object f, parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL char *errors = "strict" ndarray[object] result object val @@ -1909,17 +1893,17 @@ cdef _apply_converter(object f, parser_t *parser, int col, if not PY3 and c_encoding == NULL: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyBytes_FromString(word) result[i] = f(val) elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'): for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyUnicode_FromString(word) result[i] = f(val) else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyUnicode_Decode(word, strlen(word), c_encoding, errors) result[i] = f(val) diff --git a/pandas/rpy/__init__.py b/pandas/rpy/__init__.py index 899b684ecbff9..bad7ebc580ce2 100644 --- a/pandas/rpy/__init__.py +++ b/pandas/rpy/__init__.py @@ -5,7 +5,10 @@ import warnings warnings.warn("The pandas.rpy module is deprecated and will be " "removed in a future version. We refer to external packages " - "like rpy2, found here: http://rpy.sourceforge.net", FutureWarning) + "like rpy2. 
" + "\nSee here for a guide on how to port your code to rpy2: " + "http://pandas.pydata.org/pandas-docs/stable/r_interface.html", + FutureWarning) try: from .common import importr, r, load_data diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 30b06c8a93142..83278fe12d641 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -100,7 +100,7 @@ def __init__(self, data=None, index=None, columns=None, mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif data is None: - data = {} + data = DataFrame() if index is None: index = Index([]) @@ -115,7 +115,7 @@ def __init__(self, data=None, index=None, columns=None, index=index, kind=self._default_kind, fill_value=self._default_fill_value) - mgr = dict_to_manager(data, columns, index) + mgr = df_to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) @@ -155,7 +155,7 @@ def _init_dict(self, data, index, columns, dtype=None): kind=self._default_kind, fill_value=self._default_fill_value, copy=True) - sdict = {} + sdict = DataFrame() for k, v in compat.iteritems(data): if isinstance(v, Series): # Force alignment, no copy necessary @@ -181,7 +181,7 @@ def _init_dict(self, data, index, columns, dtype=None): if c not in sdict: sdict[c] = sp_maker(nan_vec) - return dict_to_manager(sdict, columns, index) + return df_to_manager(sdict, columns, index) def _init_matrix(self, data, index, columns, dtype=None): data = _prep_ndarray(data, copy=False) @@ -228,12 +228,12 @@ def _unpickle_sparse_frame_compat(self, state): else: index = idx - series_dict = {} + series_dict = DataFrame() for col, (sp_index, sp_values) in compat.iteritems(series): series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, fill_value=fv) - self._data = dict_to_manager(series_dict, columns, index) + self._data = df_to_manager(series_dict, columns, index) self._default_fill_value = fv self._default_kind = kind @@ -418,7 +418,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): new_index, new_columns = this.index, this.columns if level is not None: - raise NotImplementedError + raise NotImplementedError("'level' argument is not supported") if self.empty and other.empty: return SparseDataFrame(index=new_index).__finalize__(self) @@ -459,9 +459,9 @@ def _combine_match_index(self, other, func, level=None, fill_value=None): new_data = {} if fill_value is not None: - raise NotImplementedError + raise NotImplementedError("'fill_value' argument is not supported") if level is not None: - raise NotImplementedError + raise NotImplementedError("'level' argument is not supported") new_index = self.index.union(other.index) this = self @@ -494,9 +494,9 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): # possible for this to happen, which is bothersome if fill_value is not None: - raise NotImplementedError + raise NotImplementedError("'fill_value' argument is not supported") if level is not None: - raise NotImplementedError + raise NotImplementedError("'level' argument is not supported") new_data = {} @@ -567,10 +567,10 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, raise TypeError('Reindex by level not supported for sparse') if com.notnull(fill_value): - raise NotImplementedError + raise NotImplementedError("'fill_value' argument is not supported") if limit: - raise NotImplementedError + raise NotImplementedError("'limit' argument is not supported") # TODO: fill value handling sdict = dict((k, v) for k, v in 
compat.iteritems(self) if k in columns) @@ -737,13 +737,13 @@ def applymap(self, func): """ return self.apply(lambda x: lmap(func, x)) -def dict_to_manager(sdict, columns, index): - """ create and return the block manager from a dict of series, columns, index """ +def df_to_manager(sdf, columns, index): + """ create and return the block manager from a dataframe of series, columns, index """ # from BlockManager perspective axes = [_ensure_index(columns), _ensure_index(index)] - return create_block_manager_from_arrays([sdict[c] for c in columns], columns, axes) + return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes) def stack_sparse_frame(frame): diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index d3f3f59f264c5..34256acfb0e60 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -32,7 +32,7 @@ def __set__(self, obj, value): value = _ensure_index(value) if isinstance(value, MultiIndex): - raise NotImplementedError + raise NotImplementedError("value cannot be a MultiIndex") for v in compat.itervalues(obj._frames): setattr(v, self.frame_attr, value) @@ -159,7 +159,7 @@ def _get_items(self): def _set_items(self, new_items): new_items = _ensure_index(new_items) if isinstance(new_items, MultiIndex): - raise NotImplementedError + raise NotImplementedError("itemps cannot be a MultiIndex") # need to create new frames dict diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 2c328e51b5090..f53cc66bee961 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -399,7 +399,7 @@ def abs(self): res_sp_values = np.abs(self.sp_values) return self._constructor(res_sp_values, index=self.index, sparse_index=self.sp_index, - fill_value=self.fill_value) + fill_value=self.fill_value).__finalize__(self) def get(self, label, default=None): """ diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index f187e7f883e11..a7a78ba226a0b 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -36,7 +36,7 @@ import pandas.tests.test_panel as test_panel import pandas.tests.test_series as test_series -from .test_array import assert_sp_array_equal +from pandas.sparse.tests.test_array import assert_sp_array_equal import warnings warnings.filterwarnings(action='ignore', category=FutureWarning) @@ -281,7 +281,7 @@ def test_constructor_nonnan(self): arr = [0, 0, 0, nan, nan] sp_series = SparseSeries(arr, fill_value=0) assert_equal(sp_series.values.values, arr) - + # GH 9272 def test_constructor_empty(self): sp = SparseSeries() @@ -509,6 +509,21 @@ def _check_inplace_op(iop, op): _check_inplace_op( getattr(operator, "i%s" % op), getattr(operator, op)) + def test_abs(self): + s = SparseSeries([1, 2, -3], name='x') + expected = SparseSeries([1, 2, 3], name='x') + result = s.abs() + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = abs(s) + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = np.abs(s) + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + def test_reindex(self): def _compare_with_series(sps, new_index): spsre = sps.reindex(new_index) @@ -997,7 +1012,7 @@ def test_constructor_ndarray(self): ValueError, "^Column length", SparseDataFrame, self.frame.values, columns=self.frame.columns[:-1]) - # GH 9272 + # GH 9272 def test_constructor_empty(self): sp = SparseDataFrame() self.assertEqual(len(sp.index), 0) @@ -1283,7 +1298,9 @@ def _check_frame(frame): frame['E'] = 
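A usage sketch mirroring the `test_abs` case added above: with `__finalize__` applied in `SparseSeries.abs`, metadata such as the name now survives `abs()` and `np.abs()`. This assumes a pandas version that still ships `SparseSeries`:

    import numpy as np
    import pandas as pd

    s = pd.SparseSeries([1, 2, -3], name='x')
    print(s.abs().name)    # 'x'
    print(np.abs(s).name)  # 'x'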
to_insert expected = to_insert.to_dense().reindex( frame.index).fillna(to_insert.fill_value) - assert_series_equal(frame['E'].to_dense(), expected) + result = frame['E'].to_dense() + assert_series_equal(result, expected, check_names=False) + self.assertEqual(result.name, 'E') # insert Series frame['F'] = frame['A'].to_dense() @@ -1663,6 +1680,12 @@ def test_as_blocks(self): self.assertEqual(list(df_blocks.keys()), ['float64']) assert_frame_equal(df_blocks['float64'], df) + def test_nan_columnname(self): + # GH 8822 + nan_colname = DataFrame(Series(1.0,index=[0]),columns=[nan]) + nan_colname_sparse = nan_colname.to_sparse() + self.assertTrue(np.isnan(nan_colname_sparse.columns[0])) + def _dense_series_compare(s, f): result = f(s) @@ -1741,8 +1764,8 @@ def test_constructor(self): with tm.assertRaisesRegexp(TypeError, "input must be a dict, a 'list' was passed"): SparsePanel(['a', 'b', 'c']) - - # GH 9272 + + # GH 9272 def test_constructor_empty(self): sp = SparsePanel() self.assertEqual(len(sp.items), 0) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 575fcf386f570..598cdff30e4f7 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -37,6 +37,8 @@ cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -93,12 +95,7 @@ def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s] values, """ -take_2d_axis0_template = """@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_%(name)s_%(dest)s(%(c_type_in)s[:, :] values, - ndarray[int64_t] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): +inner_take_2d_axis0_template = """\ cdef: Py_ssize_t i, j, k, n, idx %(c_type_out)s fv @@ -140,12 +137,34 @@ def take_2d_axis0_%(name)s_%(dest)s(%(c_type_in)s[:, :] values, """ -take_2d_axis1_template = """@cython.wraparound(False) +take_2d_axis0_template = """\ +@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_%(name)s_%(dest)s(%(c_type_in)s[:, :] values, +cdef inline take_2d_axis0_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values, + int64_t[:] indexer, + %(c_type_out)s[:, :] out, + fill_value=np.nan): +""" + inner_take_2d_axis0_template + """ + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, ndarray[int64_t] indexer, %(c_type_out)s[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_%(name)s_%(dest)s_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
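The comment above states the key constraint: a non-const Cython typed memoryview cannot be built over a readonly buffer, so the generated take functions first check `values.flags.writeable` and only then call the memoryview variant, otherwise falling back to an ndarray-typed loop. A pure-Python sketch of that dispatch shape, with hypothetical helper names:

    import numpy as np

    def take_1d_sketch(values, indexer, out, fill_value=np.nan):
        # writable buffers may use the fast (memoryview-style) path; readonly
        # buffers must use the plain ndarray path
        if values.flags.writeable:
            _take_1d_memview(values, indexer, out, fill_value)
        else:
            _take_1d_ndarray(values, indexer, out, fill_value)

    def _take_1d_memview(values, indexer, out, fill_value):
        _take_1d_ndarray(values, indexer, out, fill_value)  # same logic here

    def _take_1d_ndarray(values, indexer, out, fill_value):
        for i, idx in enumerate(indexer):
            out[i] = fill_value if idx == -1 else values[idx]

    values = np.frombuffer(np.arange(4.0).tobytes(), dtype=np.float64)  # readonly
    out = np.empty(3)
    take_1d_sketch(values, np.array([0, -1, 3]), out)
    print(out)  # [ 0. nan  3.]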
+""" + inner_take_2d_axis0_template + + +inner_take_2d_axis1_template = """\ cdef: Py_ssize_t i, j, k, n, idx %(c_type_out)s fv @@ -165,9 +184,36 @@ def take_2d_axis1_%(name)s_%(dest)s(%(c_type_in)s[:, :] values, out[i, j] = fv else: out[i, j] = %(preval)svalues[i, idx]%(postval)s - """ +take_2d_axis1_template = """\ +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values, + int64_t[:] indexer, + %(c_type_out)s[:, :] out, + fill_value=np.nan): +""" + inner_take_2d_axis1_template + """ + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, + ndarray[int64_t] indexer, + %(c_type_out)s[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_%(name)s_%(dest)s_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +""" + inner_take_2d_axis1_template + + take_2d_multi_template = """@cython.wraparound(False) @cython.boundscheck(False) def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, @@ -629,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -1653,7 +1699,8 @@ def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, b = 0 if K > 1: - raise NotImplementedError + raise NotImplementedError("Argument 'values' must have only " + "one dimension") else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index cab3a84f6ffe8..428decd4dca10 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -28,6 +28,8 @@ ctypedef unsigned char UChar cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2704,10 +2706,10 @@ 
def take_1d_object_object(ndarray[object] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_bool_bool(uint8_t[:, :] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, + int64_t[:] indexer, + uint8_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx uint8_t fv @@ -2747,30 +2749,41 @@ def take_2d_axis0_bool_bool(uint8_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_bool_object(uint8_t[:, :] values, +def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, ndarray[int64_t] indexer, - object[:, :] out, + uint8_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx - object fv + uint8_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF False: + IF True: cdef: - object *v - object *o + uint8_t *v + uint8_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): + values.strides[1] == sizeof(uint8_t) and + sizeof(uint8_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -2780,7 +2793,7 @@ def take_2d_axis0_bool_object(uint8_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) + memmove(o, v, (sizeof(uint8_t) * k)) return for i from 0 <= i < n: @@ -2790,32 +2803,32 @@ def take_2d_axis0_bool_object(uint8_t[:, :] values, out[i, j] = fv else: for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False + out[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int8_int8(int8_t[:, :] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int8_t fv + object fv n = len(indexer) k = values.shape[1] fv = fill_value - IF True: + IF False: cdef: - int8_t *v - int8_t *o + object *v + object *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -2825,7 +2838,7 @@ def take_2d_axis0_int8_int8(int8_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) + memmove(o, v, (sizeof(object) * k)) return for i from 0 <= i < n: @@ -2835,17 +2848,28 @@ def take_2d_axis0_int8_int8(int8_t[:, :] values, out[i, j] = fv else: for j from 0 <= j < k: - out[i, j] = values[idx, j] + out[i, j] = True if values[idx, j] > 0 else False + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int8_int32(int8_t[:, :] values, +def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, ndarray[int64_t] indexer, - int32_t[:, :] out, + object[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_bool_object_memview(values, indexer, out, + 
fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx - int32_t fv + object fv n = len(indexer) k = values.shape[1] @@ -2854,13 +2878,13 @@ def take_2d_axis0_int8_int32(int8_t[:, :] values, IF False: cdef: - int32_t *v - int32_t *o + object *v + object *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -2870,7 +2894,7 @@ def take_2d_axis0_int8_int32(int8_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) + memmove(o, v, (sizeof(object) * k)) return for i from 0 <= i < n: @@ -2880,32 +2904,32 @@ def take_2d_axis0_int8_int32(int8_t[:, :] values, out[i, j] = fv else: for j from 0 <= j < k: - out[i, j] = values[idx, j] + out[i, j] = True if values[idx, j] > 0 else False @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int8_int64(int8_t[:, :] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, + int64_t[:] indexer, + int8_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int64_t fv + int8_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF False: + IF True: cdef: - int64_t *v - int64_t *o + int8_t *v + int8_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -2915,7 +2939,7 @@ def take_2d_axis0_int8_int64(int8_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) + memmove(o, v, (sizeof(int8_t) * k)) return for i from 0 <= i < n: @@ -2927,30 +2951,41 @@ def take_2d_axis0_int8_int64(int8_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int8_float64(int8_t[:, :] values, +def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, ndarray[int64_t] indexer, - float64_t[:, :] out, + int8_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + int8_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF False: + IF True: cdef: - float64_t *v - float64_t *o + int8_t *v + int8_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -2960,7 +2995,7 @@ def take_2d_axis0_int8_float64(int8_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) + memmove(o, v, (sizeof(int8_t) * k)) return for i from 0 <= i < n: @@ -2974,28 +3009,28 @@ def take_2d_axis0_int8_float64(int8_t[:, :] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int16_int16(int16_t[:, :] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int16_t fv + int32_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF True: + IF False: cdef: - int16_t *v - int16_t *o + int32_t *v + int32_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3005,7 +3040,7 @@ def take_2d_axis0_int16_int16(int16_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) + memmove(o, v, (sizeof(int32_t) * k)) return for i from 0 <= i < n: @@ -3017,12 +3052,23 @@ def take_2d_axis0_int16_int16(int16_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int16_int32(int16_t[:, :] values, +def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, ndarray[int64_t] indexer, int32_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx int32_t fv @@ -3064,10 +3110,10 @@ def take_2d_axis0_int16_int32(int16_t[:, :] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int16_int64(int16_t[:, :] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx int64_t fv @@ -3107,15 +3153,26 @@ def take_2d_axis0_int16_int64(int16_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int16_float64(int16_t[:, :] values, +def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, ndarray[int64_t] indexer, - float64_t[:, :] out, + int64_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + int64_t fv n = len(indexer) k = values.shape[1] @@ -3124,13 +3181,13 @@ def take_2d_axis0_int16_float64(int16_t[:, :] values, IF False: cdef: - float64_t *v - float64_t *o + int64_t *v + int64_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3140,7 +3197,7 @@ def take_2d_axis0_int16_float64(int16_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) + memmove(o, v, (sizeof(int64_t) * k)) return for i from 0 <= i < n: @@ -3154,28 +3211,28 @@ def take_2d_axis0_int16_float64(int16_t[:, :] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int32_int32(int32_t[:, :] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int32_t fv + float64_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF True: + IF False: cdef: - int32_t *v - int32_t *o + float64_t *v + float64_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3185,7 +3242,7 @@ def take_2d_axis0_int32_int32(int32_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) + memmove(o, v, (sizeof(float64_t) * k)) return for i from 0 <= i < n: @@ -3197,15 +3254,26 @@ def take_2d_axis0_int32_int32(int32_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int32_int64(int32_t[:, :] values, +def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, ndarray[int64_t] indexer, - int64_t[:, :] out, + float64_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_float64_memview(values, indexer, out, + 
fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx - int64_t fv + float64_t fv n = len(indexer) k = values.shape[1] @@ -3214,13 +3282,13 @@ def take_2d_axis0_int32_int64(int32_t[:, :] values, IF False: cdef: - int64_t *v - int64_t *o + float64_t *v + float64_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3230,7 +3298,7 @@ def take_2d_axis0_int32_int64(int32_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) + memmove(o, v, (sizeof(float64_t) * k)) return for i from 0 <= i < n: @@ -3244,28 +3312,28 @@ def take_2d_axis0_int32_int64(int32_t[:, :] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int32_float64(int32_t[:, :] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, + int64_t[:] indexer, + int16_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + int16_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF False: + IF True: cdef: - float64_t *v - float64_t *o + int16_t *v + int16_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3275,7 +3343,7 @@ def take_2d_axis0_int32_float64(int32_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) + memmove(o, v, (sizeof(int16_t) * k)) return for i from 0 <= i < n: @@ -3287,15 +3355,26 @@ def take_2d_axis0_int32_float64(int32_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int64_int64(int64_t[:, :] values, +def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, ndarray[int64_t] indexer, - int64_t[:, :] out, + int16_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - int64_t fv + int16_t fv n = len(indexer) k = values.shape[1] @@ -3304,13 +3383,13 @@ def take_2d_axis0_int64_int64(int64_t[:, :] values, IF True: cdef: - int64_t *v - int64_t *o + int16_t *v + int16_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3320,7 +3399,7 @@ def take_2d_axis0_int64_int64(int64_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) + memmove(o, v, (sizeof(int16_t) * k)) return for i from 0 <= i < n: @@ -3334,13 +3413,13 @@ def take_2d_axis0_int64_int64(int64_t[:, :] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int64_float64(int64_t[:, :] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + int32_t fv n = len(indexer) k = values.shape[1] @@ -3349,13 +3428,13 @@ def take_2d_axis0_int64_float64(int64_t[:, :] values, IF False: cdef: - float64_t *v - float64_t *o + int32_t *v + int32_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3365,7 +3444,7 @@ def take_2d_axis0_int64_float64(int64_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) + memmove(o, v, (sizeof(int32_t) * k)) return for i from 0 <= i < n: @@ -3377,30 +3456,41 @@ def take_2d_axis0_int64_float64(int64_t[:, :] values, for j from 0 <= j < k: out[i, j] = values[idx, j] + + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_float32_float32(float32_t[:, :] values, +def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, ndarray[int64_t] indexer, - float32_t[:, :] out, + int32_t[:, :] out, fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - float32_t fv + int32_t fv n = len(indexer) k = values.shape[1] fv = fill_value - IF True: + IF False: cdef: - float32_t *v - float32_t *o + int32_t *v + int32_t *o #GH3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): for i from 0 <= i < n: idx = indexer[i] @@ -3410,7 +3500,7 @@ def take_2d_axis0_float32_float32(float32_t[:, :] values, else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) + memmove(o, v, (sizeof(int32_t) * k)) return for i from 0 <= i < n: @@ -3424,13 +3514,13 @@ def take_2d_axis0_float32_float32(float32_t[:, :] values, @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_float32_float64(float32_t[:, :] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + int64_t fv n = len(indexer) k = values.shape[1] @@ -3439,9 +3529,166 @@ def take_2d_axis0_float32_float64(float32_t[:, :] values, IF False: cdef: - float64_t *v - float64_t *o - + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + #GH3130 if (values.strides[1] == out.strides[1] and values.strides[1] == sizeof(float64_t) and @@ -3459,114 +3706,1528 @@ def take_2d_axis0_float32_float64(float32_t[:, :] values, return for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, + int64_t[:] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
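The `IF True:` block guarded by the GH3130 stride test above is a fast path: when both `values` and `out` are contiguous along axis 1 (`strides[1]` equals the item size) and the copy is large enough to be worth it (the `sizeof(dtype) * n >= 256` check), each selected row of `k` elements is copied with a single `memmove` instead of an element-by-element inner loop. A rough NumPy-level analogue of the same condition, for illustration only:

    import numpy as np

    def take_rows_fast(values, indexer, out, fill_value=np.nan):
        itemsize = values.dtype.itemsize
        row_copy_ok = values.strides[1] == out.strides[1] == itemsize
        for i, idx in enumerate(indexer):
            if idx == -1:
                out[i, :] = fill_value
            elif row_copy_ok:
                # whole-row copy; for contiguous rows NumPy performs this as a
                # single block copy, analogous to memmove(o, v, sizeof(dtype) * k)
                out[i, :] = values[idx, :]
            else:
                for j in range(values.shape[1]):
                    out[i, j] = values[idx, j]
        return out

    values = np.arange(12.0).reshape(4, 3)   # C-contiguous, strides[1] == 8
    out = np.empty((2, 3))
    take_rows_fast(values, np.array([3, -1]), out)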
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_object_object_memview(object[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, + int64_t[:] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
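The `take_2d_axis1_*` functions starting above differ from the axis-0 variants chiefly in which axis the indexer walks: `indexer[j]` selects a source column for every output column `j`, `-1` again means "missing, write `fill_value`", and empty inputs return early. A compact NumPy reference of the same semantics for comparison (hypothetical helper, not the pandas dispatch; the output is forced to `float64` here so NaN can be stored):

    import numpy as np

    def take_2d_axis1_ref(values, indexer, fill_value=np.nan):
        # Gather columns by position; -1 marks output columns to fill.
        out = values[:, indexer].astype("float64")
        out[:, indexer == -1] = fill_value
        return out

    values = np.arange(12).reshape(3, 4)
    take_2d_axis1_ref(values, np.array([2, -1, 0]))
    # array([[ 2., nan,  0.],
    #        [ 6., nan,  4.],
    #        [10., nan,  8.]])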
+ cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, + int64_t[:] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
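In the `_bool_object` variants above, boolean data is handled as `uint8` (NumPy stores `bool_` in one byte), so the take writes `True if values[i, idx] > 0 else False` to put real Python booleans into the object output rather than raw `0`/`1` bytes. A small illustration of that distinction, using plain NumPy outside the diff:

    import numpy as np

    mask = np.array([True, False, True])
    raw = mask.view(np.uint8)        # what the uint8_t-typed code sees: array([1, 0, 1], dtype=uint8)

    out = np.empty(3, dtype=object)
    for j, byte in enumerate(raw):
        out[j] = bool(byte)          # same effect as `True if byte > 0 else False`
    # out holds Python bools: array([True, False, True], dtype=object)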
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, + int64_t[:] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] + else: + out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_float64_float64(float64_t[:, :] values, +def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, ndarray[int64_t] indexer, - float64_t[:, :] out, + int32_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
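The mixed-dtype signatures in this block (`int8 -> float64`, `int16 -> float64`, and so on) exist because the default `fill_value` is `np.nan`, which cannot be stored in an integer output: taking from integer data with missing positions therefore writes into a float (or object) buffer. A quick sketch of the effect with plain NumPy, as a stand-in only:

    import numpy as np

    values = np.array([[1, 2, 3],
                       [4, 5, 6]], dtype="int16")
    indexer = np.array([1, -1])

    # an int16 output cannot represent NaN, so an int16 -> float64 take is used
    out = np.empty((2, 3), dtype="float64")
    for i, idx in enumerate(indexer):
        out[i, :] = np.nan if idx == -1 else values[idx, :]
    # out: [[4., 5., 6.], [nan, nan, nan]]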
cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + int32_t fv - n = len(indexer) - k = values.shape[1] + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return fv = fill_value - IF True: - cdef: - float64_t *v - float64_t *o + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): + n = len(values) + k = len(indexer) - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return + if n == 0 or k == 0: + return + + fv = fill_value for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] + else: + out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_object_object(object[:, :] values, +def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, ndarray[int64_t] indexer, - object[:, :] out, + int64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - object fv + int64_t fv - n = len(indexer) - k = values.shape[1] + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return fv = fill_value - IF False: - cdef: - object *v - object *o + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): + n = len(values) + k = len(indexer) - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return + if n == 0 or k == 0: + return + + fv = fill_value for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] + else: + out[i, j] = values[i, idx] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_bool_bool(uint8_t[:, :] values, +def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, ndarray[int64_t] indexer, - uint8_t[:, :] out, + float64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx - uint8_t fv + float64_t fv n = len(values) k = len(indexer) @@ -3583,16 +5244,15 @@ def take_2d_axis1_bool_bool(uint8_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_bool_object(uint8_t[:, :] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - object fv + int32_t fv n = len(values) k = len(indexer) @@ -3608,17 +5268,28 @@ def take_2d_axis1_bool_object(uint8_t[:, :] values, if idx == -1: out[i, j] = fv else: - out[i, j] = True if values[i, idx] > 0 else False + out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int8_int8(int8_t[:, :] values, +def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, ndarray[int64_t] indexer, - int8_t[:, :] out, + int32_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - int8_t fv + int32_t fv n = len(values) k = len(indexer) @@ -3635,16 +5306,15 @@ def take_2d_axis1_int8_int8(int8_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int8_int32(int8_t[:, :] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int32_t fv + int64_t fv n = len(values) k = len(indexer) @@ -3662,12 +5332,23 @@ def take_2d_axis1_int8_int32(int8_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int8_int64(int8_t[:, :] values, +def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, ndarray[int64_t] indexer, int64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx int64_t fv @@ -3687,13 +5368,12 @@ def take_2d_axis1_int8_int64(int8_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int8_float64(int8_t[:, :] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx float64_t fv @@ -3714,15 +5394,26 @@ def take_2d_axis1_int8_float64(int8_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int16_int16(int16_t[:, :] values, +def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, ndarray[int64_t] indexer, - int16_t[:, :] out, + float64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - int16_t fv + float64_t fv n = len(values) k = len(indexer) @@ -3739,16 +5430,15 @@ def take_2d_axis1_int16_int16(int16_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int16_int32(int16_t[:, :] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int32_t fv + int64_t fv n = len(values) k = len(indexer) @@ -3766,12 +5456,23 @@ def take_2d_axis1_int16_int32(int16_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int16_int64(int16_t[:, :] values, +def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, ndarray[int64_t] indexer, int64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx int64_t fv @@ -3791,13 +5492,12 @@ def take_2d_axis1_int16_int64(int16_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int16_float64(int16_t[:, :] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx float64_t fv @@ -3818,15 +5518,26 @@ def take_2d_axis1_int16_float64(int16_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int32_int32(int32_t[:, :] values, +def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, ndarray[int64_t] indexer, - int32_t[:, :] out, + float64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx - int32_t fv + float64_t fv n = len(values) k = len(indexer) @@ -3843,16 +5554,15 @@ def take_2d_axis1_int32_int32(int32_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int32_int64(int32_t[:, :] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, + int64_t[:] indexer, + float32_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int64_t fv + float32_t fv n = len(values) k = len(indexer) @@ -3870,15 +5580,26 @@ def take_2d_axis1_int32_int64(int32_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int32_float64(int32_t[:, :] values, +def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, ndarray[int64_t] indexer, - float64_t[:, :] out, + float32_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + float32_t fv n = len(values) k = len(indexer) @@ -3895,16 +5616,15 @@ def take_2d_axis1_int32_float64(int32_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int64_int64(int64_t[:, :] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - int64_t fv + float64_t fv n = len(values) k = len(indexer) @@ -3922,12 +5642,23 @@ def take_2d_axis1_int64_int64(int64_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int64_float64(int64_t[:, :] values, +def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, ndarray[int64_t] indexer, float64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx float64_t fv @@ -3947,16 +5678,15 @@ def take_2d_axis1_int64_float64(int64_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float32_float32(float32_t[:, :] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - float32_t fv + float64_t fv n = len(values) k = len(indexer) @@ -3974,12 +5704,23 @@ def take_2d_axis1_float32_float32(float32_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float32_float64(float32_t[:, :] values, +def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, ndarray[int64_t] indexer, float64_t[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. cdef: Py_ssize_t i, j, k, n, idx float64_t fv @@ -3999,16 +5740,15 @@ def take_2d_axis1_float32_float64(float32_t[:, :] values, out[i, j] = fv else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float64_float64(float64_t[:, :] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): +cdef inline take_2d_axis1_object_object_memview(object[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - float64_t fv + object fv n = len(values) k = len(indexer) @@ -4026,12 +5766,23 @@ def take_2d_axis1_float64_float64(float64_t[:, :] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_object_object(object[:, :] values, +def take_2d_axis1_object_object(ndarray[object, ndim=2] values, ndarray[int64_t] indexer, object[:, :] out, fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
cdef: Py_ssize_t i, j, k, n, idx object fv @@ -4052,7 +5803,6 @@ def take_2d_axis1_object_object(object[:, :] values, else: out[i, j] = values[i, idx] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, @@ -5784,7 +7534,8 @@ def group_ohlc_float64(ndarray[float64_t, ndim=2] out, b = 0 if K > 1: - raise NotImplementedError + raise NotImplementedError("Argument 'values' must have only " + "one dimension") else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5857,7 +7608,8 @@ def group_ohlc_float32(ndarray[float32_t, ndim=2] out, b = 0 if K > 1: - raise NotImplementedError + raise NotImplementedError("Argument 'values' must have only " + "one dimension") else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index dbe6f2f1f8351..55d5e37fc19ee 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -1,8 +1,11 @@ +import sys cimport util from tslib import NaT from datetime import datetime, timedelta iNaT = util.get_nat() +cdef bint PY2 = sys.version_info[0] == 2 + # core.common import for fast inference checks def is_float(object obj): return util.is_float_object(obj) @@ -38,10 +41,10 @@ _TYPE_MAP = { 'f' : 'floating', 'complex128': 'complex', 'c' : 'complex', - 'string': 'string', - 'S' : 'string', - 'unicode': 'unicode', - 'U' : 'unicode', + 'string': 'string' if PY2 else 'bytes', + 'S' : 'string' if PY2 else 'bytes', + 'unicode': 'unicode' if PY2 else 'string', + 'U' : 'unicode' if PY2 else 'string', 'bool': 'boolean', 'b' : 'boolean', 'datetime64[ns]' : 'datetime64', @@ -181,6 +184,10 @@ def infer_dtype(object _values): if is_unicode_array(values): return 'unicode' + elif PyBytes_Check(val): + if is_bytes_array(values): + return 'bytes' + elif is_timedelta(val): if is_timedelta_or_timedelta64_array(values): return 'timedelta' @@ -196,11 +203,6 @@ def infer_dtype(object _values): return 'mixed' -def infer_dtype_list(list values): - cdef: - Py_ssize_t i, n = len(values) - pass - def is_possible_datetimelike_array(object arr): # determine if we have a possible datetimelike (or null-like) array @@ -253,7 +255,6 @@ def is_bool_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.bool_): return True @@ -277,7 +278,6 @@ def is_integer_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.integer): return True @@ -298,7 +298,6 @@ def is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.integer): return True @@ -321,7 +320,6 @@ def is_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.floating): return True @@ -342,9 +340,9 @@ def is_string_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj - if issubclass(values.dtype.type, (np.string_, np.unicode_)): + if ((PY2 and issubclass(values.dtype.type, np.string_)) or + not PY2 and issubclass(values.dtype.type, np.unicode_)): return True elif values.dtype == np.object_: objbuf = values @@ -363,7 +361,6 @@ def is_unicode_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.unicode_): return True @@ -381,8 +378,29 @@ def 
is_unicode_array(ndarray values): return False +def is_bytes_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + + if issubclass(values.dtype.type, np.bytes_): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not PyBytes_Check(objbuf[i]): + return False + return True + else: + return False + + def is_datetime_array(ndarray[object] values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -399,7 +417,7 @@ def is_datetime_array(ndarray[object] values): return null_count != n def is_datetime64_array(ndarray values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -416,7 +434,7 @@ def is_datetime64_array(ndarray values): return null_count != n def is_timedelta_array(ndarray values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -431,7 +449,7 @@ def is_timedelta_array(ndarray values): return null_count != n def is_timedelta64_array(ndarray values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -447,7 +465,7 @@ def is_timedelta64_array(ndarray values): def is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -462,7 +480,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return null_count != n def is_date_array(ndarray[object] values): - cdef int i, n = len(values) + cdef Py_ssize_t i, n = len(values) if n == 0: return False for i in range(n): @@ -471,7 +489,7 @@ def is_date_array(ndarray[object] values): return True def is_time_array(ndarray[object] values): - cdef int i, n = len(values) + cdef Py_ssize_t i, n = len(values) if n == 0: return False for i in range(n): @@ -484,7 +502,7 @@ def is_period(object o): return isinstance(o,Period) def is_period_array(ndarray[object] values): - cdef int i, n = len(values) + cdef Py_ssize_t i, n = len(values) from pandas.tseries.period import Period if n == 0: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 975142ebacc2a..3be17f17d6afa 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -38,7 +38,7 @@ See LICENSE for the license * RESTORE_FINAL (2): * Put the file position at the next byte after the * data read from the file_buffer. 
-* +* #define RESTORE_NOT 0 #define RESTORE_INITIAL 1 #define RESTORE_FINAL 2 @@ -304,7 +304,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->stream_len, &self->stream_cap, nbytes * 2, sizeof(char), &status); - TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", + TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { @@ -334,7 +334,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words_len, &self->words_cap, nbytes, sizeof(char*), &status); - TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", + TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -371,7 +371,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->lines + 1, &self->lines_cap, nbytes, sizeof(int), &status); - TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -398,7 +398,7 @@ static int push_char(parser_t *self, char c) { /* TRACE(("pushing %c \n", c)) */ TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { - TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", + TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) self->error_msg = (char*) malloc(64); sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); @@ -463,7 +463,6 @@ static void append_warning(parser_t *self, const char *msg) { static int end_line(parser_t *self) { int fields; - khiter_t k; /* for hash set detection */ int ex_fields = self->expected_fields; char *msg; @@ -483,7 +482,7 @@ static int end_line(parser_t *self) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; - + // skip the tokens from this bad line self->line_start[self->lines] += fields; @@ -605,12 +604,11 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int status; size_t bytes_read; - void *src = self->source; status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); - TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", nbytes, bytes_read, status)); self->datalen = bytes_read; @@ -704,7 +702,7 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != + return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != ((kh_int64_t*)self->skipset)->n_buckets ); } else { @@ -757,11 +755,9 @@ int tokenize_delimited(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == '\n') { - END_LINE() - } - else { - self->state = SKIP_LINE; + 
END_LINE(); } break; } @@ -786,7 +782,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) else self->state = EAT_CRNL; break; - } + } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; @@ -853,11 +849,12 @@ int tokenize_delimited(parser_t *self, size_t line_limit) ; else { // backtrack /* We have to use i + 1 because buf has been incremented but not i */ - while (i + 1 > self->datapos && *buf != '\n') { + do { --buf; --i; - } - if (i + 1 > self->datapos) // reached a newline rather than the beginning + } while (i + 1 > self->datapos && *buf != '\n'); + + if (*buf == '\n') // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline ++i; @@ -1077,7 +1074,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) // Next character in file c = *buf++; - TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); @@ -1093,11 +1090,9 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == self->lineterminator) { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } @@ -1172,11 +1167,12 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) ; else { // backtrack /* We have to use i + 1 because buf has been incremented but not i */ - while (i + 1 > self->datapos && *buf != self->lineterminator) { + do { --buf; --i; - } - if (i + 1 > self->datapos) // reached a newline rather than the beginning + } while (i + 1 > self->datapos && *buf != self->lineterminator); + + if (*buf == self->lineterminator) // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline ++i; @@ -1342,7 +1338,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) // Next character in file c = *buf++; - TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); @@ -1391,11 +1387,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == '\n') { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } else if (c == '\n') { @@ -1756,7 +1750,7 @@ int parser_trim_buffers(parser_t *self) { /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); @@ -1877,7 +1871,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { } } - TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", + TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ @@ -2039,7 +2033,7 @@ int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep) return 
status; }*/ -int to_boolean(char *item, uint8_t *val) { +int to_boolean(const char *item, uint8_t *val) { char *tmp; int i, status = 0; @@ -2363,7 +2357,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, num_digits++; num_decimals++; } - + if (num_digits >= max_digits) // consume extra decimal digits while (isdigit(*p)) ++p; @@ -2659,4 +2653,4 @@ uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error) *error = 0; return number; } -*/ \ No newline at end of file +*/ diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 694a73ec78153..d3777e858b6ca 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -228,9 +228,12 @@ coliter_t *coliter_new(parser_t *self, int i); /* #define COLITER_NEXT(iter) iter->words[iter->line_start[iter->line++] + iter->col] */ // #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col] -#define COLITER_NEXT(iter) iter.words[*iter.line_start++ + iter.col] +#define COLITER_NEXT(iter, word) do { \ + const int i = *iter.line_start++ + iter.col; \ + word = i < *iter.line_start ? iter.words[i]: ""; \ + } while(0) -parser_t* parser_new(); +parser_t* parser_new(void); int parser_init(parser_t *self); @@ -270,6 +273,6 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, in //int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); int P_INLINE to_longlong(char *item, long long *p_value); //int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); -int to_boolean(char *item, uint8_t *val); +int to_boolean(const char *item, uint8_t *val); #endif // _PARSER_COMMON_H_ diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index cc6ad3defe4f3..b4a4930e09d68 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -710,6 +710,10 @@ cdef class Period(object): dt = value if freq is None: raise ValueError('Must supply freq for datetime value') + elif isinstance(value, np.datetime64): + dt = Timestamp(value) + if freq is None: + raise ValueError('Must supply freq for datetime value') elif isinstance(value, date): dt = datetime(year=value.year, month=value.month, day=value.day) if freq is None: diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index a30286479c847..445530bc5b00c 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -862,7 +862,7 @@ def _non_null_values(x): if mock_mean: # check that mean equals mock_mean expected = mock_mean(x) - assert_equal(mean_x, expected) + assert_equal(mean_x, expected.astype('float64')) # check that correlation of a series with itself is either 1 or NaN corr_x_x = corr(x, x) @@ -1254,7 +1254,8 @@ def _check_pairwise_moment(self, func, *args, **kwargs): actual = panel.ix[:, 1, 5] expected = func(self.frame[1], self.frame[5], *args, **kwargs) - tm.assert_series_equal(actual, expected) + tm.assert_series_equal(actual, expected, check_names=False) + self.assertEqual(actual.name, 5) def test_flex_binary_moment(self): # GH3155 @@ -1549,6 +1550,7 @@ def test_moment_functions_zero_length(self): df1_expected = df1 df1_expected_panel = Panel(items=df1.index, major_axis=df1.columns, minor_axis=df1.columns) df2 = DataFrame(columns=['a']) + df2['a'] = df2['a'].astype('float64') df2_expected = df2 df2_expected_panel = Panel(items=df2.index, major_axis=df2.columns, minor_axis=df2.columns) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 
b91c46377267a..e9526f9fad1ac 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -244,6 +244,26 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): else: self.assertRaises(AttributeError, lambda : getattr(o,op)) + def test_binary_ops_docs(self): + from pandas import DataFrame, Panel + op_map = {'add': '+', + 'sub': '-', + 'mul': '*', + 'mod': '%', + 'pow': '**', + 'truediv': '/', + 'floordiv': '//'} + for op_name in ['add', 'sub', 'mul', 'mod', 'pow', 'truediv', 'floordiv']: + for klass in [Series, DataFrame, Panel]: + operand1 = klass.__name__.lower() + operand2 = 'other' + op = op_map[op_name] + expected_str = ' '.join([operand1, op, operand2]) + self.assertTrue(expected_str in getattr(klass, op_name).__doc__) + + # reverse version of the binary ops + expected_str = ' '.join([operand2, op, operand1]) + self.assertTrue(expected_str in getattr(klass, 'r' + op_name).__doc__) class TestIndexOps(Ops): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py old mode 100644 new mode 100755 index 7f4b3fcb94dfa..21b64378cfc24 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp +from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex from pandas.core.config import option_context import pandas.core.common as com @@ -93,6 +93,24 @@ def test_constructor_unsortable(self): else: Categorical.from_array(arr, ordered=True) + def test_is_equal_dtype(self): + + # test dtype comparisons between cats + + c1 = Categorical(list('aabca'),categories=list('abc'),ordered=False) + c2 = Categorical(list('aabca'),categories=list('cab'),ordered=False) + c3 = Categorical(list('aabca'),categories=list('cab'),ordered=True) + self.assertTrue(c1.is_dtype_equal(c1)) + self.assertTrue(c2.is_dtype_equal(c2)) + self.assertTrue(c3.is_dtype_equal(c3)) + self.assertFalse(c1.is_dtype_equal(c2)) + self.assertFalse(c1.is_dtype_equal(c3)) + self.assertFalse(c1.is_dtype_equal(Index(list('aabca')))) + self.assertFalse(c1.is_dtype_equal(c1.astype(object))) + self.assertTrue(c1.is_dtype_equal(CategoricalIndex(c1))) + self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,categories=list('cab')))) + self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,ordered=True))) + def test_constructor(self): exp_arr = np.array(["a", "b", "c", "a", "b", "c"]) @@ -114,6 +132,9 @@ def f(): Categorical([1,2], [1,2,np.nan, np.nan]) self.assertRaises(ValueError, f) + # The default should be unordered + c1 = Categorical(["a", "b", "c", "a"]) + self.assertFalse(c1.ordered) # Categorical as input c1 = Categorical(["a", "b", "c", "a"]) @@ -221,6 +242,18 @@ def f(): c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) cat = Categorical([1,2], categories=[1,2,3]) + # this is a legitimate constructor + with tm.assert_produces_warning(None): + c = Categorical(np.array([],dtype='int64'),categories=[3,2,1],ordered=True) + + def test_constructor_with_index(self): + + ci = CategoricalIndex(list('aabbca'),categories=list('cab')) + self.assertTrue(ci.values.equals(Categorical(ci))) + + ci = CategoricalIndex(list('aabbca'),categories=list('cab')) + self.assertTrue(ci.values.equals(Categorical(ci.astype(object),categories=ci.categories))) + def test_constructor_with_generator(self): # This was raising an Error in isnull(single_val).any() because isnull returned a scalar # for a generator @@ 
-367,6 +400,13 @@ def f(): self.assertRaises(TypeError, lambda: a < cat) self.assertRaises(TypeError, lambda: a < cat_rev) + # Make sure that unequal comparison take the categories order in account + cat_rev = pd.Categorical(list("abc"), categories=list("cba"), ordered=True) + exp = np.array([True, False, False]) + res = cat_rev > "b" + self.assert_numpy_array_equal(res, exp) + + def test_na_flags_int_categories(self): # #1457 @@ -481,6 +521,15 @@ def test_empty_print(self): expected = ("[], Categories (0, object): []") self.assertEqual(expected, repr(factor)) + def test_print_none_width(self): + # GH10087 + a = pd.Series(pd.Categorical([1,2,3,4], name="a")) + exp = u("0 1\n1 2\n2 3\n3 4\n" + + "Name: a, dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + + with option_context("display.width", None): + self.assertEqual(exp, repr(a)) + def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') @@ -717,6 +766,19 @@ def f(): cat.add_categories(["d"]) self.assertRaises(ValueError, f) + # GH 9927 + cat = Categorical(list("abc"), ordered=True) + expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) + # test with Series, np.array, index, list + res = cat.add_categories(Series(["d", "e"])) + self.assert_categorical_equal(res, expected) + res = cat.add_categories(np.array(["d", "e"])) + self.assert_categorical_equal(res, expected) + res = cat.add_categories(Index(["d", "e"])) + self.assert_categorical_equal(res, expected) + res = cat.add_categories(["d", "e"]) + self.assert_categorical_equal(res, expected) + def test_remove_categories(self): cat = Categorical(["a","b","c","a"], ordered=True) old = cat.copy() @@ -1077,6 +1139,20 @@ def test_reflected_comparison_with_scalars(self): self.assert_numpy_array_equal(cat > cat[0], [False, True, True]) self.assert_numpy_array_equal(cat[0] < cat, [False, True, True]) + def test_comparison_with_unknown_scalars(self): + # https://github.com/pydata/pandas/issues/9836#issuecomment-92123057 and following + # comparisons with scalars not in categories should raise for unequal comps, but not for + # equal/not equal + cat = pd.Categorical([1, 2, 3], ordered=True) + + self.assertRaises(TypeError, lambda: cat < 4) + self.assertRaises(TypeError, lambda: cat > 4) + self.assertRaises(TypeError, lambda: 4 < cat) + self.assertRaises(TypeError, lambda: 4 > cat) + + self.assert_numpy_array_equal(cat == 4 , [False, False, False]) + self.assert_numpy_array_equal(cat != 4 , [True, True, True]) + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True @@ -1753,6 +1829,35 @@ def f(x): expected['person_name'] = expected['person_name'].astype('object') tm.assert_frame_equal(result, expected) + # GH 9921 + # Monotonic + df = DataFrame({"a": [5, 15, 25]}) + c = pd.cut(df.a, bins=[0,10,20,30,40]) + tm.assert_series_equal(df.a.groupby(c).transform(sum), df['a']) + tm.assert_series_equal(df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) + + # Filter + tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) + tm.assert_frame_equal(df.groupby(c).filter(np.all), df) + + # Non-monotonic + df = DataFrame({"a": [5, 15, 25, -5]}) + c = pd.cut(df.a, bins=[-10, 0,10,20,30,40]) + tm.assert_series_equal(df.a.groupby(c).transform(sum), df['a']) + tm.assert_series_equal(df.a.groupby(c).transform(lambda xs: np.sum(xs)), 
df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + + # GH 9603 + df = pd.DataFrame({'a': [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4]) + result = df.groupby(c).apply(len) + expected = pd.Series([1, 0, 0, 0], index=c.values.categories) + expected.index.name = 'a' + tm.assert_series_equal(result, expected) + def test_pivot_table(self): raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True) @@ -2390,6 +2495,18 @@ def test_comparisons(self): exp = Series([False, False, True]) tm.assert_series_equal(res, exp) + scalar = base[1] + res = cat > scalar + exp = Series([False, False, True]) + exp2 = cat.values > scalar + tm.assert_series_equal(res, exp) + tm.assert_numpy_array_equal(res.values, exp2) + res_rev = cat_rev > scalar + exp_rev = Series([True, False, False]) + exp_rev2 = cat_rev.values > scalar + tm.assert_series_equal(res_rev, exp_rev) + tm.assert_numpy_array_equal(res_rev.values, exp_rev2) + # Only categories with same categories can be compared def f(): cat > cat_rev @@ -2408,9 +2525,29 @@ def f(): self.assertRaises(TypeError, lambda: a < cat) self.assertRaises(TypeError, lambda: a < cat_rev) - # Categoricals can be compared to scalar values - res = cat_rev > base[0] - tm.assert_series_equal(res, exp) + # unequal comparison should raise for unordered cats + cat = Series(Categorical(list("abc"))) + def f(): + cat > "b" + self.assertRaises(TypeError, f) + cat = Series(Categorical(list("abc"), ordered=False)) + def f(): + cat > "b" + self.assertRaises(TypeError, f) + + # https://github.com/pydata/pandas/issues/9836#issuecomment-92123057 and following + # comparisons with scalars not in categories should raise for unequal comps, but not for + # equal/not equal + cat = Series(Categorical(list("abc"), ordered=True)) + + self.assertRaises(TypeError, lambda: cat < "d") + self.assertRaises(TypeError, lambda: cat > "d") + self.assertRaises(TypeError, lambda: "d" < cat) + self.assertRaises(TypeError, lambda: "d" > cat) + + self.assert_series_equal(cat == "d" , Series([False, False, False])) + self.assert_series_equal(cat != "d" , Series([True, True, True])) + # And test NaN handling... 
cat = Series(Categorical(["a","b","c", np.nan])) @@ -2506,6 +2643,8 @@ def f(): dfx['grade'].cat.categories self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + def test_concat_preserve(self): + # GH 8641 # series concat not preserving category dtype s = Series(list('abc'),dtype='category') @@ -2523,6 +2662,28 @@ def f(): expected = Series(list('abcabc'),index=[0,1,2,0,1,2]).astype('category') tm.assert_series_equal(result, expected) + a = Series(np.arange(6,dtype='int64')) + b = Series(list('aabbca')) + + df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }) + result = pd.concat([df2,df2]) + expected = DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }) + tm.assert_frame_equal(result, expected) + + def test_categorical_index_preserver(self): + + a = Series(np.arange(6,dtype='int64')) + b = Series(list('aabbca')) + + df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }).set_index('B') + result = pd.concat([df2,df2]) + expected = DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }).set_index('B') + tm.assert_frame_equal(result, expected) + + # wrong categories + df3 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('abc')) }).set_index('B') + self.assertRaises(TypeError, lambda : pd.concat([df2,df3])) + def test_append(self): cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] @@ -2658,6 +2819,14 @@ def cmp(a,b): self.assertRaises(TypeError, lambda : invalid(s)) + def test_astype_categorical(self): + + cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + tm.assert_categorical_equal(cat,cat.astype('category')) + tm.assert_almost_equal(np.array(cat),cat.astype('object')) + + self.assertRaises(ValueError, lambda : cat.astype(float)) + def test_to_records(self): # GH8626 diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index d0ae7c9988c8d..c3d39fcdf906f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -524,6 +524,47 @@ def test_is_recompilable(): for f in fails: assert not com.is_re_compilable(f) +def test_random_state(): + import numpy.random as npr + # Check with seed + state = com._random_state(5) + assert_equal(state.uniform(), npr.RandomState(5).uniform()) + + # Check with random state object + state2 = npr.RandomState(10) + assert_equal(com._random_state(state2).uniform(), npr.RandomState(10).uniform()) + + # check with no arg random state + assert isinstance(com._random_state(), npr.RandomState) + + # Error for floats or strings + with tm.assertRaises(ValueError): + com._random_state('test') + + with tm.assertRaises(ValueError): + com._random_state(5.5) + + +def test_maybe_match_name(): + + matched = com._maybe_match_name(Series([1], name='x'), Series([2], name='x')) + assert(matched == 'x') + + matched = com._maybe_match_name(Series([1], name='x'), Series([2], name='y')) + assert(matched is None) + + matched = com._maybe_match_name(Series([1]), Series([2], name='x')) + assert(matched is None) + + matched = com._maybe_match_name(Series([1], name='x'), Series([2])) + assert(matched is None) + + matched = com._maybe_match_name(Series([1], name='x'), [2]) + assert(matched == 'x') + + matched = com._maybe_match_name([1], Series([2], name='y')) + assert(matched == 'y') + class TestTake(tm.TestCase): # standard incompatible fill error @@ -608,8 +649,9 @@ def _test_dtype(dtype, fill_value, out_dtype): _test_dtype(np.bool_,
'', np.object_) def test_2d_with_out(self): - def _test_dtype(dtype, can_hold_na): + def _test_dtype(dtype, can_hold_na, writeable=True): data = np.random.randint(0, 2, (5, 3)).astype(dtype) + data.flags.writeable = writeable indexer = [2, 1, 0, 1] out0 = np.empty((4, 3), dtype=dtype) @@ -640,18 +682,22 @@ def _test_dtype(dtype, can_hold_na): # no exception o/w data.take(indexer, out=out, axis=i) - _test_dtype(np.float64, True) - _test_dtype(np.float32, True) - _test_dtype(np.uint64, False) - _test_dtype(np.uint32, False) - _test_dtype(np.uint16, False) - _test_dtype(np.uint8, False) - _test_dtype(np.int64, False) - _test_dtype(np.int32, False) - _test_dtype(np.int16, False) - _test_dtype(np.int8, False) - _test_dtype(np.object_, True) - _test_dtype(np.bool, False) + for writeable in [True, False]: + # Check that take_nd works both with writeable arrays (in which + # case fast typed memoryviews implementation) and read-only + # arrays alike. + _test_dtype(np.float64, True, writeable=writeable) + _test_dtype(np.float32, True, writeable=writeable) + _test_dtype(np.uint64, False, writeable=writeable) + _test_dtype(np.uint32, False, writeable=writeable) + _test_dtype(np.uint16, False, writeable=writeable) + _test_dtype(np.uint8, False, writeable=writeable) + _test_dtype(np.int64, False, writeable=writeable) + _test_dtype(np.int32, False, writeable=writeable) + _test_dtype(np.int16, False, writeable=writeable) + _test_dtype(np.int8, False, writeable=writeable) + _test_dtype(np.object_, True, writeable=writeable) + _test_dtype(np.bool, False, writeable=writeable) def test_2d_fill_nonna(self): def _test_dtype(dtype, fill_value, out_dtype): diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index ce32c8af99a73..a7129bca59a7f 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -14,7 +14,7 @@ from numpy.random import randn import numpy as np -from pandas import DataFrame, Series, Index, Timestamp, MultiIndex +from pandas import DataFrame, Series, Index, Timestamp, MultiIndex, date_range, NaT import pandas.core.format as fmt import pandas.util.testing as tm @@ -298,6 +298,21 @@ def mkframe(n): com.pprint_thing(df._repr_fits_horizontal_()) self.assertTrue(has_expanded_repr(df)) + def test_str_max_colwidth(self): + # GH 7856 + df = pd.DataFrame([{'a': 'foo', 'b': 'bar', + 'c': 'uncomfortably long line with lots of stuff', + 'd': 1}, + {'a': 'foo', 'b': 'bar', 'c': 'stuff', 'd': 1}]) + df.set_index(['a', 'b', 'c']) + self.assertTrue(str(df) == ' a b c d\n' + '0 foo bar uncomfortably long line with lots of stuff 1\n' + '1 foo bar stuff 1') + with option_context('max_colwidth', 20): + self.assertTrue(str(df) == ' a b c d\n' + '0 foo bar uncomfortably lo... 
1\n' + '1 foo bar stuff 1') + def test_auto_detect(self): term_width, term_height = get_terminal_size() fac = 1.05 # Arbitrary large factor to exceed term widht @@ -2194,6 +2209,28 @@ def test_to_latex_multiindex(self): x & y & a \\ \bottomrule \end{tabular} +""" + self.assertEqual(result, expected) + + df = DataFrame.from_dict({ + ('c1', 0): pd.Series(dict((x, x) for x in range(4))), + ('c1', 1): pd.Series(dict((x, x + 4) for x in range(4))), + ('c2', 0): pd.Series(dict((x, x) for x in range(4))), + ('c2', 1): pd.Series(dict((x, x + 4) for x in range(4))), + ('c3', 0): pd.Series(dict((x, x) for x in range(4))), + }).T + result = df.to_latex() + expected = r"""\begin{tabular}{llrrrr} +\toprule + & & 0 & 1 & 2 & 3 \\ +\midrule +c1 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ +c2 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ +c3 & 0 & 0 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} """ self.assertEqual(result, expected) @@ -2458,7 +2495,7 @@ def test_to_string(self): def test_freq_name_separation(self): s = Series(np.random.randn(10), - index=pd.date_range('1/1/2000', periods=10), name=0) + index=date_range('1/1/2000', periods=10), name=0) result = repr(s) self.assertTrue('Freq: D, Name: 0' in result) @@ -2519,7 +2556,6 @@ def test_float_trim_zeros(self): def test_datetimeindex(self): - from pandas import date_range, NaT index = date_range('20130102',periods=6) s = Series(1,index=index) result = s.to_string() @@ -2537,7 +2573,6 @@ def test_datetimeindex(self): def test_timedelta64(self): - from pandas import date_range from datetime import datetime, timedelta Series(np.array([1100, 20], dtype='timedelta64[ns]')).to_string() @@ -2986,6 +3021,25 @@ def test_format(self): self.assertEqual(result[0], " 12") self.assertEqual(result[1], " 0") + def test_output_significant_digits(self): + # Issue #9764 + + # In case default display precision changes: + with pd.option_context('display.precision', 7): + # DataFrame example from issue #9764 + d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) + + expected_output={ + (0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', + (1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', + (1,8):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07\n6 5.000100e-07\n7 6.000000e-07', + (8,16):' col1\n8 9.999000e-07\n9 1.000000e-06\n10 1.000100e-06\n11 2.000000e-06\n12 4.999000e-06\n13 5.000000e-06\n14 5.000100e-06\n15 6.000000e-06', + (9,16):' col1\n9 0.000001\n10 0.000001\n11 0.000002\n12 0.000005\n13 0.000005\n14 0.000005\n15 0.000006' + } + + for (start, stop), v in expected_output.items(): + self.assertEqual(str(d[start:stop]), v) + class TestRepr_timedelta64(tm.TestCase): @@ -3123,6 +3177,44 @@ def test_date_nanos(self): result = fmt.Datetime64Formatter(x).get_result() self.assertEqual(result[0].strip(), "1970-01-01 00:00:00.000000200") + def test_dates_display(self): + + # 10170 + # make sure that we are consistently display date formatting + x = Series(date_range('20130101 09:00:00',periods=5,freq='D')) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01 09:00:00") + self.assertEqual(result[1].strip(), "NaT") + self.assertEqual(result[4].strip(), "2013-01-05 09:00:00") + + x = Series(date_range('20130101 09:00:00',periods=5,freq='s')) + x.iloc[1] = np.nan 
+ result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01 09:00:00") + self.assertEqual(result[1].strip(), "NaT") + self.assertEqual(result[4].strip(), "2013-01-01 09:00:04") + + x = Series(date_range('20130101 09:00:00',periods=5,freq='ms')) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01 09:00:00.000") + self.assertEqual(result[1].strip(), "NaT") + self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.004") + + x = Series(date_range('20130101 09:00:00',periods=5,freq='us')) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01 09:00:00.000000") + self.assertEqual(result[1].strip(), "NaT") + self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.000004") + + x = Series(date_range('20130101 09:00:00',periods=5,freq='N')) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01 09:00:00.000000000") + self.assertEqual(result[1].strip(), "NaT") + self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.000000004") class TestNaTFormatting(tm.TestCase): def test_repr(self): @@ -3159,13 +3251,13 @@ def test_date_explict_date_format(self): class TestDatetimeIndexUnicode(tm.TestCase): def test_dates(self): text = str(pd.to_datetime([datetime(2013,1,1), datetime(2014,1,1)])) - self.assertTrue("[2013-01-01," in text) - self.assertTrue(", 2014-01-01]" in text) + self.assertTrue("['2013-01-01'," in text) + self.assertTrue(", '2014-01-01']" in text) def test_mixed(self): text = str(pd.to_datetime([datetime(2013,1,1), datetime(2014,1,1,12), datetime(2014,1,1)])) - self.assertTrue("[2013-01-01 00:00:00," in text) - self.assertTrue(", 2014-01-01 00:00:00]" in text) + self.assertTrue("'2013-01-01 00:00:00'," in text) + self.assertTrue("'2014-01-01 00:00:00']" in text) class TestStringRepTimestamp(tm.TestCase): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cdda087b27613..4964d13f7ac28 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -16,7 +16,7 @@ from pandas.compat import( map, zip, range, long, lrange, lmap, lzip, - OrderedDict, u, StringIO + OrderedDict, u, StringIO, string_types ) from pandas import compat @@ -31,9 +31,9 @@ import pandas.core.common as com import pandas.core.format as fmt import pandas.core.datetools as datetools -from pandas import (DataFrame, Index, Series, notnull, isnull, +from pandas import (DataFrame, Index, Series, Panel, notnull, isnull, MultiIndex, DatetimeIndex, Timestamp, date_range, - read_csv, timedelta_range, Timedelta, + read_csv, timedelta_range, Timedelta, CategoricalIndex, option_context) import pandas as pd from pandas.parser import CParserError @@ -784,6 +784,16 @@ def test_setitem_None(self): assert_series_equal(self.frame[None], self.frame['A']) repr(self.frame) + def test_setitem_empty(self): + # GH 9596 + df = pd.DataFrame({'a': ['1', '2', '3'], + 'b': ['11', '22', '33'], + 'c': ['111', '222', '333']}) + + result = df.copy() + result.loc[result.b.isnull(), 'a'] = result.a + assert_frame_equal(result, df) + def test_delitem_corner(self): f = self.frame.copy() del f['D'] @@ -2376,6 +2386,36 @@ def test_set_index_pass_arrays(self): expected = df.set_index(['A', 'B'], drop=False) assert_frame_equal(result, expected, check_names=False) # TODO should set_index check_names ? 
+ def test_construction_with_categorical_index(self): + + ci = tm.makeCategoricalIndex(10) + + # with Categorical + df = DataFrame({'A' : np.random.randn(10), + 'B' : ci.values }) + idf = df.set_index('B') + str(idf) + tm.assert_index_equal(idf.index, ci, check_names=False) + self.assertEqual(idf.index.name, 'B') + + # from a CategoricalIndex + df = DataFrame({'A' : np.random.randn(10), + 'B' : ci }) + idf = df.set_index('B') + str(idf) + tm.assert_index_equal(idf.index, ci, check_names=False) + self.assertEqual(idf.index.name, 'B') + + idf = df.set_index('B').reset_index().set_index('B') + str(idf) + tm.assert_index_equal(idf.index, ci, check_names=False) + self.assertEqual(idf.index.name, 'B') + + new_df = idf.reset_index() + new_df.index = df.B + tm.assert_index_equal(new_df.index, ci, check_names=False) + self.assertEqual(idf.index.name, 'B') + def test_set_index_cast_datetimeindex(self): df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], @@ -2751,6 +2791,59 @@ def test_insert_error_msmgs(self): with assertRaisesRegexp(TypeError, msg): df['gr'] = df.groupby(['b', 'c']).count() + def test_frame_subclassing_and_slicing(self): + # Subclass frame and ensure it returns the right class on slicing it + # In reference to PR 9632 + + class CustomSeries(Series): + @property + def _constructor(self): + return CustomSeries + + def custom_series_function(self): + return 'OK' + + class CustomDataFrame(DataFrame): + "Subclasses pandas DF, fills DF with simulation results, adds some custom plotting functions." + + def __init__(self, *args, **kw): + super(CustomDataFrame, self).__init__(*args, **kw) + + @property + def _constructor(self): + return CustomDataFrame + + _constructor_sliced = CustomSeries + + def custom_frame_function(self): + return 'OK' + + data = {'col1': range(10), + 'col2': range(10)} + cdf = CustomDataFrame(data) + + # Did we get back our own DF class? + self.assertTrue(isinstance(cdf, CustomDataFrame)) + + # Do we get back our own Series class after selecting a column? + cdf_series = cdf.col1 + self.assertTrue(isinstance(cdf_series, CustomSeries)) + self.assertEqual(cdf_series.custom_series_function(), 'OK') + + # Do we get back our own DF class after slicing row-wise? 
+ cdf_rows = cdf[1:5] + self.assertTrue(isinstance(cdf_rows, CustomDataFrame)) + self.assertEqual(cdf_rows.custom_frame_function(), 'OK') + + # Make sure sliced part of multi-index frame is custom class + mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')]) + cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) + self.assertTrue(isinstance(cdf_multi['A'], CustomDataFrame)) + + mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')]) + cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) + self.assertTrue(isinstance(cdf_multi2['A'], CustomSeries)) + def test_constructor_subclass_dict(self): # Test for passing dict subclass to constructor data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), @@ -3125,6 +3218,19 @@ def test_constructor_empty_list(self): expected = DataFrame(index=[]) assert_frame_equal(df, expected) + # GH 9939 + df = DataFrame([], columns=['A', 'B']) + expected = DataFrame({}, columns=['A', 'B']) + assert_frame_equal(df, expected) + + # Empty generator: list(empty_gen()) == [] + def empty_gen(): + return + yield + + df = DataFrame(empty_gen(), columns=['A', 'B']) + assert_frame_equal(df, expected) + def test_constructor_list_of_lists(self): # GH #484 l = [[1, 'a'], [2, 'b']] @@ -4192,6 +4298,30 @@ def test_astype_cast_nan_int(self): df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]}) self.assertRaises(ValueError, df.astype, np.int64) + def test_astype_str(self): + # GH9757 + a = Series(date_range('2010-01-04', periods=5)) + b = Series(date_range('3/6/2012 00:00', periods=5, tz='US/Eastern')) + c = Series([Timedelta(x, unit='d') for x in range(5)]) + d = Series(range(5)) + e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + + df = DataFrame({'a' : a, 'b' : b, 'c' : c, 'd' : d, 'e' : e}) + + # Test str and unicode on python 2.x and just str on python 3.x + for tt in set([str, compat.text_type]): + result = df.astype(tt) + + expected = DataFrame({ + 'a' : list(map(tt, a.values)), + 'b' : list(map(tt, b.values)), + 'c' : list(map(tt, c.values)), + 'd' : list(map(tt, d.values)), + 'e' : list(map(tt, e.values)), + }) + + assert_frame_equal(result, expected) + def test_array_interface(self): result = np.sqrt(self.frame) tm.assert_isinstance(result, type(self.frame)) @@ -5944,6 +6074,20 @@ def test_boolean_comparison(self): self.assertRaises(ValueError, lambda : df == (2,2)) self.assertRaises(ValueError, lambda : df == [2,2]) + def test_equals_different_blocks(self): + # GH 9330 + df0 = pd.DataFrame({"A": ["x","y"], "B": [1,2], + "C": ["w","z"]}) + df1 = df0.reset_index()[["A","B","C"]] + # this assert verifies that the above operations have + # induced a block rearrangement + self.assertTrue(df0._data.blocks[0].dtype != + df1._data.blocks[0].dtype) + # do the real tests + self.assert_frame_equal(df0, df1) + self.assertTrue(df0.equals(df1)) + self.assertTrue(df1.equals(df0)) + def test_to_csv_from_csv(self): pname = '__tmp_to_csv_from_csv__' @@ -7409,6 +7553,26 @@ def test_drop_names(self): self.assertEqual(obj.columns.name, 'second') self.assertEqual(list(df.columns), ['d', 'e', 'f']) + self.assertRaises(ValueError, df.drop, ['g']) + self.assertRaises(ValueError, df.drop, ['g'], 1) + + # errors = 'ignore' + dropped = df.drop(['g'], errors='ignore') + expected = Index(['a', 'b', 'c'], name='first') + self.assert_index_equal(dropped.index, expected) + + dropped = df.drop(['b', 'g'], errors='ignore') + expected = Index(['a', 'c'], name='first') + self.assert_index_equal(dropped.index, expected) + + dropped = df.drop(['g'], axis=1, errors='ignore') + expected 
= Index(['d', 'e', 'f'], name='second') + self.assert_index_equal(dropped.columns, expected) + + dropped = df.drop(['d', 'g'], axis=1, errors='ignore') + expected = Index(['e', 'f'], name='second') + self.assert_index_equal(dropped.columns, expected) + def test_dropEmptyRows(self): N = len(self.frame.index) mat = randn(N) @@ -7787,6 +7951,19 @@ def test_drop(self): assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.ix[[2], :]) assert_frame_equal(simple.drop([0, 3], axis='index'), simple.ix[[1, 2], :]) + self.assertRaises(ValueError, simple.drop, 5) + self.assertRaises(ValueError, simple.drop, 'C', 1) + self.assertRaises(ValueError, simple.drop, [1, 5]) + self.assertRaises(ValueError, simple.drop, ['A', 'C'], 1) + + # errors = 'ignore' + assert_frame_equal(simple.drop(5, errors='ignore'), simple) + assert_frame_equal(simple.drop([0, 5], errors='ignore'), + simple.ix[[1, 2, 3], :]) + assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple) + assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'), + simple[['B']]) + #non-unique - wheee! nu_df = DataFrame(lzip(range(3), range(-3, 1), list('abc')), columns=['a', 'a', 'b']) @@ -9976,6 +10153,12 @@ def test_diff_float_n(self): xp = self.tsframe.diff(1) assert_frame_equal(rs, xp) + def test_diff_axis(self): + # GH 9727 + df = DataFrame([[1., 2.], [3., 4.]]) + assert_frame_equal(df.diff(axis=1), DataFrame([[np.nan, 1.], [np.nan, 1.]])) + assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2., 2.]])) + def test_pct_change(self): rs = self.tsframe.pct_change(fill_method=None) assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) @@ -10663,6 +10846,19 @@ def test_sort_index(self): with assertRaisesRegexp(ValueError, msg): frame.sort_index(by=['A', 'B'], axis=0, ascending=[True] * 5) + def test_sort_index_categorical_index(self): + + df = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + + result = df.sort_index() + expected = df.iloc[[4,0,1,5,2,3]] + assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[3,2,5,1,0,4]] + assert_frame_equal(result, expected) + def test_sort_nan(self): # GH3917 nan = np.nan @@ -11274,6 +11470,39 @@ def test_dataframe_clip(self): self.assertTrue((clipped_df.values[ub_mask] == ub).all() == True) self.assertTrue((clipped_df.values[mask] == df.values[mask]).all() == True) + def test_clip_against_series(self): + # GH #6966 + + df = DataFrame(np.random.randn(1000, 2)) + lb = Series(np.random.randn(1000)) + ub = lb + 1 + + clipped_df = df.clip(lb, ub, axis=0) + + for i in range(2): + lb_mask = df.iloc[:, i] <= lb + ub_mask = df.iloc[:, i] >= ub + mask = ~lb_mask & ~ub_mask + + assert_series_equal(clipped_df.loc[lb_mask, i], lb[lb_mask]) + assert_series_equal(clipped_df.loc[ub_mask, i], ub[ub_mask]) + assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) + + def test_clip_against_frame(self): + df = DataFrame(np.random.randn(1000, 2)) + lb = DataFrame(np.random.randn(1000, 2)) + ub = lb + 1 + + clipped_df = df.clip(lb, ub) + + lb_mask = df <= lb + ub_mask = df >= ub + mask = ~lb_mask & ~ub_mask + + assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) + assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) + assert_frame_equal(clipped_df[mask], df[mask]) + def test_get_X_columns(self): # numeric and object columns @@ -11684,16 +11913,14 @@ def test_mode(self): df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], "B": [10, 10, 10, 
np.nan, 3, 4], "C": [8, 8, 8, 9, 9, 9], - "D": range(6), + "D": np.arange(6,dtype='int64'), "E": [8, 8, 1, 1, 3, 3]}) assert_frame_equal(df[["A"]].mode(), pd.DataFrame({"A": [12]})) - assert_frame_equal(df[["D"]].mode(), - pd.DataFrame(pd.Series([], dtype="int64"), - columns=["D"])) - assert_frame_equal(df[["E"]].mode(), - pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"), - columns=["E"])) + expected = pd.Series([], dtype='int64', name='D').to_frame() + assert_frame_equal(df[["D"]].mode(), expected) + expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame() + assert_frame_equal(df[["E"]].mode(), expected) assert_frame_equal(df[["A", "B"]].mode(), pd.DataFrame({"A": [12], "B": [10.]})) assert_frame_equal(df.mode(), @@ -11715,7 +11942,7 @@ def test_mode(self): com.pprint_thing(b) assert_frame_equal(a, b) # should work with heterogeneous types - df = pd.DataFrame({"A": range(6), + df = pd.DataFrame({"A": np.arange(6,dtype='int64'), "B": pd.date_range('2011', periods=6), "C": list('abcdef')}) exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype), @@ -12357,6 +12584,31 @@ def test_unstack_bool(self): ['c', 'l']])) assert_frame_equal(rs, xp) + def test_unstack_level_binding(self): + # GH9856 + mi = pd.MultiIndex( + levels=[[u('foo'), u('bar')], [u('one'), u('two')], + [u('a'), u('b')]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], + names=[u('first'), u('second'), u('third')]) + s = pd.Series(0, index=mi) + result = s.unstack([1, 2]).stack(0) + + expected_mi = pd.MultiIndex( + levels=[['foo', 'bar'], ['one', 'two']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=['first', 'second']) + + expected = pd.DataFrame(np.array([[np.nan, 0], + [0, np.nan], + [np.nan, 0], + [0, np.nan]], + dtype=np.float64), + index=expected_mi, + columns=pd.Index(['a', 'b'], name='third')) + + self.assert_frame_equal(result, expected) + def test_unstack_to_series(self): # check reversibility data = self.frame.unstack() @@ -14059,12 +14311,21 @@ def test_assign(self): assert_frame_equal(result, expected) def test_assign_multiple(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) - expected = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9], - 'D': [1, 2, 3], 'E': [4, 5, 6]}) - # column order isn't preserved - assert_frame_equal(result.reindex_like(expected), expected) + expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], + [3, 6, 9, 3, 6]], columns=list('ABCDE')) + assert_frame_equal(result, expected) + + def test_assign_alphabetical(self): + # GH 9818 + df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + result = df.assign(D=df.A + df.B, C=df.A - df.B) + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + result = df.assign(C=df.A - df.B, D=df.A + df.B) + assert_frame_equal(result, expected) def test_assign_bad(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) @@ -14099,6 +14360,27 @@ def _constructor(self): # GH9776 self.assertEqual(df.iloc[0:1, :].testattr, 'XXX') + def test_to_panel_expanddim(self): + # GH 9762 + + class SubclassedFrame(DataFrame): + @property + def _constructor_expanddim(self): + return SubclassedPanel + + class SubclassedPanel(Panel): + pass + + index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)]) + df = SubclassedFrame({'X':[1, 2, 3], 'Y': [4, 5, 6]}, index=index) + result = df.to_panel() + self.assertTrue(isinstance(result, SubclassedPanel)) + expected = 
SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]], + items=['X', 'Y'], major_axis=[0], + minor_axis=[0, 1, 2], + dtype='int64') + tm.assert_panel_equal(result, expected) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3dd8c2594cd46..3f751310438e4 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -354,6 +354,178 @@ def test_head_tail(self): self._compare(o.head(-3), o.head(7)) self._compare(o.tail(-3), o.tail(7)) + def test_sample(self): + # Fixes issue: 2419 + + o = self._construct(shape=10) + + ### + # Check behavior of random_state argument + ### + + # Check for stability when receives seed or random state -- run 10 times. + for test in range(10): + seed = np.random.randint(0,100) + self._compare(o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed)) + self._compare(o.sample(frac=0.7,random_state=seed), o.sample(frac=0.7, random_state=seed)) + + self._compare(o.sample(n=4, random_state=np.random.RandomState(test)), + o.sample(n=4, random_state=np.random.RandomState(test))) + + self._compare(o.sample(frac=0.7,random_state=np.random.RandomState(test)), + o.sample(frac=0.7, random_state=np.random.RandomState(test))) + + + # Check for error when random_state argument invalid. + with tm.assertRaises(ValueError): + o.sample(random_state='astring!') + + ### + # Check behavior of `frac` and `N` + ### + + # Giving both frac and N throws error + with tm.assertRaises(ValueError): + o.sample(n=3, frac=0.3) + + # Check that raises right error for negative lengths + with tm.assertRaises(ValueError): + o.sample(n=-3) + with tm.assertRaises(ValueError): + o.sample(frac=-0.3) + + # Make sure float values of `n` give error + with tm.assertRaises(ValueError): + o.sample(n= 3.2) + + # Check lengths are right + self.assertTrue(len(o.sample(n=4) == 4)) + self.assertTrue(len(o.sample(frac=0.34) == 3)) + self.assertTrue(len(o.sample(frac=0.36) == 4)) + + ### + # Check weights + ### + + # Weight length must be right + with tm.assertRaises(ValueError): + o.sample(n=3, weights=[0,1]) + + with tm.assertRaises(ValueError): + bad_weights = [0.5]*11 + o.sample(n=3, weights=bad_weights) + + # Check won't accept negative weights + with tm.assertRaises(ValueError): + bad_weights = [-0.1]*10 + o.sample(n=3, weights=bad_weights) + + # Check inf and -inf throw errors: + with tm.assertRaises(ValueError): + weights_with_inf = [0.1]*10 + weights_with_inf[0] = np.inf + o.sample(n=3, weights=weights_with_inf) + + with tm.assertRaises(ValueError): + weights_with_ninf = [0.1]*10 + weights_with_ninf[0] = -np.inf + o.sample(n=3, weights=weights_with_ninf) + + + # A few dataframe test with degenerate weights. + easy_weight_list = [0]*10 + easy_weight_list[5] = 1 + + df = pd.DataFrame({'col1':range(10,20), + 'col2':range(20,30), + 'colString': ['a']*10, + 'easyweights':easy_weight_list}) + sample1 = df.sample(n=1, weights='easyweights') + assert_frame_equal(sample1, df.iloc[5:6]) + + # Ensure proper error if string given as weight for Series, panel, or + # DataFrame with axis = 1. 
+ s = Series(range(10)) + with tm.assertRaises(ValueError): + s.sample(n=3, weights='weight_column') + + panel = pd.Panel(items = [0,1,2], major_axis = [2,3,4], minor_axis = [3,4,5]) + with tm.assertRaises(ValueError): + panel.sample(n=1, weights='weight_column') + + with tm.assertRaises(ValueError): + df.sample(n=1, weights='weight_column', axis = 1) + + # Check weighting key error + with tm.assertRaises(KeyError): + df.sample(n=3, weights='not_a_real_column_name') + + # Check np.nan are replaced by zeros. + weights_with_nan = [np.nan]*10 + weights_with_nan[5] = 0.5 + self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) + + # Check None are also replaced by zeros. + weights_with_None = [None]*10 + weights_with_None[5] = 0.5 + self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + + # Check that re-normalizes weights that don't sum to one. + weights_less_than_1 = [0]*10 + weights_less_than_1[0] = 0.5 + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + + + ### + # Test axis argument + ### + + # Test axis argument + df = pd.DataFrame({'col1':range(10), 'col2':['a']*10}) + second_column_weight = [0,1] + assert_frame_equal(df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']]) + + # Different axis arg types + assert_frame_equal(df.sample(n=1, axis='columns', weights=second_column_weight), + df[['col2']]) + + weight = [0]*10 + weight[5] = 0.5 + assert_frame_equal(df.sample(n=1, axis='rows', weights=weight), + df.iloc[5:6]) + assert_frame_equal(df.sample(n=1, axis='index', weights=weight), + df.iloc[5:6]) + + + # Check out of range axis values + with tm.assertRaises(ValueError): + df.sample(n=1, axis=2) + + with tm.assertRaises(ValueError): + df.sample(n=1, axis='not_a_name') + + with tm.assertRaises(ValueError): + s = pd.Series(range(10)) + s.sample(n=1, axis=1) + + # Test weight length compared to correct axis + with tm.assertRaises(ValueError): + df.sample(n=1, axis=1, weights=[0.5]*10) + + # Check weights with axis = 1 + easy_weight_list = [0]*3 + easy_weight_list[2] = 1 + + df = pd.DataFrame({'col1':range(10,20), + 'col2':range(20,30), + 'colString': ['a']*10}) + sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) + assert_frame_equal(sample1, df[['colString']]) + + # Test default axes + p = pd.Panel(items = ['a','b','c'], major_axis=[2,4,6], minor_axis=[1,3,5]) + assert_panel_equal(p.sample(n=3, random_state=42), p.sample(n=3, axis=1, random_state=42)) + assert_frame_equal(df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42)) def test_size_compat(self): # GH8846 diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 3ce4e150326a2..82f4b8c05ca06 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -439,6 +439,38 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None, else: raise AssertionError + def _check_grid_settings(self, obj, kinds, kws={}): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + + import matplotlib as mpl + + def is_grid_on(): + xoff = all(not g.gridOn for g in self.plt.gca().xaxis.get_major_ticks()) + yoff = all(not g.gridOn for g in self.plt.gca().yaxis.get_major_ticks()) + return not(xoff and yoff) + + spndx=1 + for kind in kinds: + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=False) + obj.plot(kind=kind, **kws) + self.assertFalse(is_grid_on()) + + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=True) + 
obj.plot(kind=kind, grid=False, **kws) + self.assertFalse(is_grid_on()) + + if kind != 'pie': + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=True) + obj.plot(kind=kind, **kws) + self.assertTrue(is_grid_on()) + + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=False) + obj.plot(kind=kind, grid=True, **kws) + self.assertTrue(is_grid_on()) @tm.mplskip class TestSeriesPlots(TestPlotBase): @@ -553,6 +585,29 @@ def test_ts_area_lim(self): self.assertEqual(xmin, line[0]) self.assertEqual(xmax, line[-1]) + def test_label(self): + s = Series([1, 2]) + ax = s.plot(label='LABEL', legend=True) + self._check_legend_labels(ax, labels=['LABEL']) + self.plt.close() + ax = s.plot(legend=True) + self._check_legend_labels(ax, labels=['None']) + self.plt.close() + # get name from index + s.name = 'NAME' + ax = s.plot(legend=True) + self._check_legend_labels(ax, labels=['NAME']) + self.plt.close() + # override the default + ax = s.plot(legend=True, label='LABEL') + self._check_legend_labels(ax, labels=['LABEL']) + self.plt.close() + # Add label info, but don't draw + ax = s.plot(legend=False, label='LABEL') + self.assertEqual(ax.get_legend(), None) # Hasn't been drawn + ax.legend() # draw it + self._check_legend_labels(ax, labels=['LABEL']) + def test_line_area_nan_series(self): values = [1, 2, np.nan, 3] s = Series(values) @@ -592,6 +647,26 @@ def test_bar_log(self): ax = Series([200, 500]).plot(log=True, kind='bar') assert_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + ax = Series([200, 500]).plot(log=True, kind='barh') + assert_array_equal(ax.xaxis.get_ticklocs(), expected) + tm.close() + + # GH 9905 + expected = np.array([1.0e-03, 1.0e-02, 1.0e-01, 1.0e+00]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((1.0e-04, expected, 1.0e+01)) + + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') + assert_array_equal(ax.get_ylim(), (0.001, 0.10000000000000001)) + assert_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') + assert_array_equal(ax.get_xlim(), (0.001, 0.10000000000000001)) + assert_array_equal(ax.xaxis.get_ticklocs(), expected) @slow def test_bar_ignore_index(self): @@ -678,6 +753,18 @@ def test_hist_df_kwargs(self): ax = df.plot(kind='hist', bins=5) self.assertEqual(len(ax.patches), 10) + @slow + def test_hist_df_with_nonnumerics(self): + # GH 9853 + with tm.RNGContext(1): + df = DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) + df['E'] = ['x', 'y'] * 5 + ax = df.plot(kind='hist', bins=5) + self.assertEqual(len(ax.patches), 20) + + ax = df.plot(kind='hist') # bins=10 + self.assertEqual(len(ax.patches), 40) + @slow def test_hist_legacy(self): _check_plot_works(self.ts.hist) @@ -1053,6 +1140,12 @@ def test_table(self): _check_plot_works(self.series.plot, table=True) _check_plot_works(self.series.plot, table=self.series) + @slow + def test_series_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings(Series([1,2,3]), + plotting._series_kinds + plotting._common_kinds) + @tm.mplskip class TestDataFramePlots(TestPlotBase): @@ -1154,6 +1247,22 @@ def test_plot(self): self.assertEqual(len(axes), 1) self.assertIs(ax.get_axes(), axes[0]) + def test_color_and_style_arguments(self): + df = DataFrame({'x': [1, 2], 'y': [3, 4]}) + # passing both 'color' and 'style' arguments should be allowed + # if there is no color symbol in the style strings: + ax = df.plot(color = ['red', 'black'],
style = ['-', '--']) + # check that the linestyles are correctly set: + linestyle = [line.get_linestyle() for line in ax.lines] + self.assertEqual(linestyle, ['-', '--']) + # check that the colors are correctly set: + color = [line.get_color() for line in ax.lines] + self.assertEqual(color, ['red', 'black']) + # passing both 'color' and 'style' arguments should not be allowed + # if there is a color symbol in the style strings: + with tm.assertRaises(ValueError): + df.plot(color = ['red', 'black'], style = ['k-', 'r--']) + def test_nonnumeric_exclude(self): df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}) ax = df.plot() @@ -1486,6 +1595,19 @@ def test_subplots_ts_share_axes(self): for ax in axes[[0, 1, 2], [2]].ravel(): self._check_visible(ax.get_yticklabels(), visible=False) + def test_subplots_sharex_axes_existing_axes(self): + # GH 9158 + d = {'A': [1., 2., 3., 4.], 'B': [4., 3., 2., 1.], 'C': [5, 1, 3, 4]} + df = DataFrame(d, index=date_range('2014 10 11', '2014 10 14')) + + axes = df[['A', 'B']].plot(subplots=True) + df['C'].plot(ax=axes[0], secondary_y=True) + + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + for ax in axes.ravel(): + self._check_visible(ax.get_yticklabels(), visible=True) + def test_negative_log(self): df = - DataFrame(rand(6, 4), index=list(string.ascii_letters[:6]), @@ -1581,7 +1703,10 @@ def test_line_lim(self): self.assertEqual(xmax, lines[0].get_data()[0][-1]) axes = df.plot(secondary_y=True, subplots=True) + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) for ax in axes: + self.assertTrue(hasattr(ax, 'left_ax')) + self.assertFalse(hasattr(ax, 'right_ax')) xmin, xmax = ax.get_xlim() lines = ax.get_lines() self.assertEqual(xmin, lines[0].get_data()[0][0]) @@ -3339,6 +3464,12 @@ def test_sharey_and_ax(self): "y label is invisible but shouldn't") + @slow + def test_df_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings(DataFrame({'a':[1,2,3],'b':[2,3,4]}), + plotting._dataframe_kinds, kws={'x':'a','y':'b'}) + @tm.mplskip class TestDataFrameGroupByPlots(TestPlotBase): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index e7001eb09f20c..0789e20df3945 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,7 +8,7 @@ from numpy import nan from pandas import date_range,bdate_range, Timestamp -from pandas.core.index import Index, MultiIndex, Int64Index +from pandas.core.index import Index, MultiIndex, Int64Index, CategoricalIndex from pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) @@ -297,9 +297,9 @@ def test_nth(self): # as it keeps the order in the series (and not the group order) # related GH 7287 expected = s.groupby(g,sort=False).first() - expected.index = range(1,10) - result = s.groupby(g).nth(0,dropna='all') - assert_series_equal(result,expected) + expected.index = pd.Index(range(1,10), name=0) + result = s.groupby(g).nth(0, dropna='all') + assert_series_equal(result, expected) # doc example df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) @@ -430,6 +430,21 @@ def test_grouper_creation_bug(self): expected = s.groupby(level='one').sum() assert_series_equal(result, expected) + def test_grouper_getting_correct_binner(self): + + # GH 10063 + # using a non-time-based grouper and a time-based grouper + # and specifying levels + df = DataFrame({'A' : 1 
}, + index=pd.MultiIndex.from_product([list('ab'), + date_range('20130101',periods=80)], + names=['one','two'])) + result = df.groupby([pd.Grouper(level='one'),pd.Grouper(level='two',freq='M')]).sum() + expected = DataFrame({'A' : [31,28,21,31,28,21]}, + index=MultiIndex.from_product([list('ab'),date_range('20130101',freq='M',periods=3)], + names=['one','two'])) + assert_frame_equal(result, expected) + def test_grouper_iter(self): self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo']) @@ -684,7 +699,6 @@ def test_get_group(self): expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1]) assert_panel_equal(gp, expected) - # GH 5267 # be datelike friendly df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', @@ -807,9 +821,10 @@ def test_apply_issues(self): # GH 5789 # don't auto coerce dates df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value']) - expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18']) + exp_idx = pd.Index(['2011.05.16','2011.05.17','2011.05.18'], dtype=object, name='date') + expected = Series(['00:00','02:00','02:00'], index=exp_idx) result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()]) - assert_series_equal(result,expected) + assert_series_equal(result, expected) def test_len(self): df = tm.makeTimeDataFrame() @@ -959,6 +974,12 @@ def demean(arr): g = df.groupby(pd.TimeGrouper('M')) g.transform(lambda x: x-1) + # GH 9700 + df = DataFrame({'a' : range(5, 10), 'b' : range(5)}) + result = df.groupby('a').transform(max) + expected = DataFrame({'b' : range(5)}) + tm.assert_frame_equal(result, expected) + def test_transform_fast(self): df = DataFrame( { 'id' : np.arange( 100000 ) / 3, @@ -1003,6 +1024,14 @@ def test_transform_broadcast(self): for idx in gp.index: assert_fp_equal(res.xs(idx), agged[idx]) + def test_transform_dtype(self): + # GH 9807 + # Check transform dtype output is preserved + df = DataFrame([[1, 3], [2, 3]]) + result = df.groupby(1).transform('mean') + expected = DataFrame([[1.5], [1.5]]) + assert_frame_equal(result, expected) + def test_transform_bug(self): # GH 5712 # transforming on a datetime column @@ -1692,7 +1721,8 @@ def test_groupby_as_index_apply(self): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) - exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) + tp = [(1, 0), (1, 2), (2, 1), (3, 4)] + exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None]) assert_index_equal(res_as_apply, exp_as_apply) assert_index_equal(res_not_as_apply, exp_not_as_apply) @@ -1712,6 +1742,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) empty_not_as = DataFrame(columns=df.columns) + empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) assert_frame_equal(empty_not_as, g_not_as.head(0)) assert_frame_equal(empty_not_as, g_not_as.tail(0)) assert_frame_equal(empty_not_as, g_not_as.head(-1)) @@ -1727,6 +1759,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) assert_frame_equal(empty_as, g_as.head(0)) assert_frame_equal(empty_as, g_as.tail(0)) 
assert_frame_equal(empty_as, g_as.head(-1)) @@ -1914,6 +1948,8 @@ def _testit(op): for (cat1, cat2), group in grouped: expd.setdefault(cat1, {})[cat2] = op(group['C']) exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ['A', 'B'] + result = op(grouped)['C'] assert_series_equal(result, exp) @@ -1966,7 +2002,7 @@ def test_cython_agg_nothing_to_agg_with_dates(self): def test_groupby_timedelta_cython_count(self): df = DataFrame({'g': list('ab' * 2), 'delt': np.arange(4).astype('timedelta64[ns]')}) - expected = Series([2, 2], index=['a', 'b'], name='delt') + expected = Series([2, 2], index=pd.Index(['a', 'b'], name='g'), name='delt') result = df.groupby('g').delt.count() tm.assert_series_equal(expected, result) @@ -2377,13 +2413,13 @@ def test_count_object(self): df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) result = df.groupby('c').a.count() - expected = pd.Series([3, 3], index=[2, 3], name='a') + expected = pd.Series([3, 3], index=pd.Index([2, 3], name='c'), name='a') tm.assert_series_equal(result, expected) df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) result = df.groupby('c').a.count() - expected = pd.Series([1, 3], index=[2, 3], name='a') + expected = pd.Series([1, 3], index=pd.Index([2, 3], name='c'), name='a') tm.assert_series_equal(result, expected) def test_count_cross_type(self): # GH8169 @@ -2800,6 +2836,49 @@ def test_groupby_list_infer_array_like(self): result = df.groupby(['foo', 'bar']).mean() expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + def test_groupby_nat_exclude(self): + # GH 6992 + df = pd.DataFrame({'values': np.random.randn(8), + 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'), + np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')], + 'str': [np.nan, 'a', np.nan, 'a', + np.nan, 'a', np.nan, 'b']}) + grouped = df.groupby('dt') + + expected = [[1, 7], [3, 5]] + keys = sorted(grouped.groups.keys()) + self.assertEqual(len(keys), 2) + for k, e in zip(keys, expected): + # grouped.groups keys are np.datetime64 with system tz + # not to be affected by tz, only compare values + self.assertEqual(grouped.groups[k], e) + + # confirm obj is not filtered + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + self.assertEqual(grouped.ngroups, 2) + expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]), + Timestamp('2013-02-01 00:00:00'): np.array([3, 5])} + for k in grouped.indices: + self.assert_numpy_array_equal(grouped.indices[k], expected[k]) + + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + + nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], + 'nat': [pd.NaT, pd.NaT, pd.NaT]}) + self.assertEqual(nan_df['nan'].dtype, 'float64') + self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]') + + for key in ['nan', 'nat']: + grouped = nan_df.groupby(key) + self.assertEqual(grouped.groups, {}) + self.assertEqual(grouped.ngroups, 0) + self.assertEqual(grouped.indices, {}) + self.assertRaises(KeyError, grouped.get_group, np.nan) + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + def test_dictify(self): dict(iter(self.df.groupby('A'))) dict(iter(self.df.groupby(['A', 'B']))) @@ -3370,12 +3449,11 @@ def test_groupby_datetime_categorical(self): cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) data = 
DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() expected = data.groupby(np.asarray(cats)).mean() expected = expected.reindex(levels) - expected.index.name = 'myfactor' + expected.index = CategoricalIndex(expected.index,categories=expected.index,name='myfactor',ordered=True) assert_frame_equal(result, expected) self.assertEqual(result.index.name, cats.name) @@ -3390,6 +3468,26 @@ def test_groupby_datetime_categorical(self): expected.index.names = ['myfactor', None] assert_frame_equal(desc_result, expected) + def test_groupby_categorical_index(self): + + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) + df = DataFrame(np.repeat(np.arange(20),4).reshape(-1,4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats') + assert_frame_equal(result, expected) + def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex @@ -3518,6 +3616,8 @@ def test_groupby_categorical_no_compress(self): result = data.groupby(cats).mean() exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) @@ -3525,6 +3625,7 @@ def test_groupby_categorical_no_compress(self): result = data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) + exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], @@ -5053,6 +5154,17 @@ def test_groupby_categorical_two_columns(self): "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) tm.assert_frame_equal(res, exp) + def test_groupby_apply_all_none(self): + # Tests to make sure no errors if apply function returns all None + # values. Issue 9684. 
+ test_df = DataFrame({'groups': [0,0,1,1], 'random_vars': [8,7,4,5]}) + + def test_func(x): + pass + result = test_df.groupby('groups').apply(test_func) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 39db387045f12..ed84c9764dd84 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -12,14 +12,10 @@ import numpy as np from numpy.testing import assert_array_equal -from pandas import period_range, date_range - -from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, - InvalidIndexError, NumericIndex) -from pandas.tseries.index import DatetimeIndex -from pandas.tseries.tdi import TimedeltaIndex -from pandas.tseries.period import PeriodIndex -from pandas.core.series import Series +from pandas import (period_range, date_range, Categorical, Series, + Index, Float64Index, Int64Index, MultiIndex, + CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) +from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, assert_copy) from pandas import compat @@ -41,6 +37,11 @@ class Base(object): _holder = None _compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes'] + def setup_indices(self): + # setup the test indices in the self.indices dict + for name, ind in self.indices.items(): + setattr(self, name, ind) + def verify_pickle(self,index): unpickled = self.round_trip_pickle(index) self.assertTrue(index.equals(unpickled)) @@ -98,6 +99,7 @@ def f(): def test_reindex_base(self): idx = self.create_index() expected = np.arange(idx.size) + actual = idx.get_indexer(idx) assert_array_equal(expected, actual) @@ -118,28 +120,18 @@ def test_ndarray_compat_properties(self): idx.nbytes idx.values.nbytes + def test_repr_roundtrip(self): -class TestIndex(Base, tm.TestCase): - _holder = Index - _multiprocess_can_split_ = True + idx = self.create_index() + tm.assert_index_equal(eval(repr(idx)),idx) - def setUp(self): - self.indices = dict( - unicodeIndex = tm.makeUnicodeIndex(100), - strIndex = tm.makeStringIndex(100), - dateIndex = tm.makeDateIndex(100), - intIndex = tm.makeIntIndex(100), - floatIndex = tm.makeFloatIndex(100), - boolIndex = Index([True,False]), - empty = Index([]), - tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], - [1, 2, 3])) - ) - for name, ind in self.indices.items(): - setattr(self, name, ind) + def test_str(self): - def create_index(self): - return Index(list('abcde')) + # test the string repr + idx = self.create_index() + idx.name = 'foo' + self.assertTrue("'foo'" in str(idx)) + self.assertTrue(idx.__class__.__name__ in str(idx)) def test_wrong_number_names(self): def testit(ind): @@ -150,14 +142,18 @@ def testit(ind): def test_set_name_methods(self): new_name = "This is the new name for this index" - indices = (self.dateIndex, self.intIndex, self.unicodeIndex, - self.empty) - for ind in indices: + for ind in self.indices.values(): + + # don't test a MultiIndex here (as it's tested separately) + if isinstance(ind, MultiIndex): + continue + original_name = ind.name new_ind = ind.set_names([new_name]) self.assertEqual(new_ind.name, new_name) self.assertEqual(ind.name, original_name) res = ind.rename(new_name, inplace=True) + # should return None self.assertIsNone(res) self.assertEqual(ind.name, new_name) @@ -167,46 +163,258 @@ def test_set_name_methods(self): # ind.set_names("a") with
assertRaisesRegexp(ValueError, "Level must be None"): ind.set_names("a", level=0) - # rename in place just leaves tuples and other containers alone - name = ('A', 'B') - ind = self.intIndex - ind.rename(name, inplace=True) - self.assertEqual(ind.name, name) - self.assertEqual(ind.names, [name]) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.strIndex).__name__): - hash(self.strIndex) + # rename in place just leaves tuples and other containers alone + name = ('A', 'B') + ind.rename(name, inplace=True) + self.assertEqual(ind.name, name) + self.assertEqual(ind.names, [name]) - def test_new_axis(self): - new_index = self.dateIndex[None, :] - self.assertEqual(new_index.ndim, 2) - tm.assert_isinstance(new_index, np.ndarray) + def test_hash_error(self): + for ind in self.indices.values(): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(ind).__name__): + hash(ind) def test_copy_and_deepcopy(self): from copy import copy, deepcopy - for func in (copy, deepcopy): - idx_copy = func(self.strIndex) - self.assertIsNot(idx_copy, self.strIndex) - self.assertTrue(idx_copy.equals(self.strIndex)) + for ind in self.indices.values(): - new_copy = self.strIndex.copy(deep=True, name="banana") - self.assertEqual(new_copy.name, "banana") - new_copy2 = self.intIndex.copy(dtype=int) - self.assertEqual(new_copy2.dtype.kind, 'i') + # don't test a MultiIndex here (as it's tested separately) + if isinstance(ind, MultiIndex): + continue + + for func in (copy, deepcopy): + idx_copy = func(ind) + self.assertIsNot(idx_copy, ind) + self.assertTrue(idx_copy.equals(ind)) + + new_copy = ind.copy(deep=True, name="banana") + self.assertEqual(new_copy.name, "banana") def test_duplicates(self): - idx = Index([0, 0, 0]) - self.assertFalse(idx.is_unique) + for ind in self.indices.values(): + + if not len(ind): + continue + idx = self._holder([ind[0]]*5) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.has_duplicates) def test_sort(self): - self.assertRaises(TypeError, self.strIndex.sort) + for ind in self.indices.values(): + self.assertRaises(TypeError, ind.sort) def test_mutability(self): - self.assertRaises(TypeError, self.strIndex.__setitem__, 0, 'foo') + for ind in self.indices.values(): + if not len(ind): + continue + self.assertRaises(TypeError, ind.__setitem__, 0, ind[0]) + + def test_view(self): + for ind in self.indices.values(): + i_view = ind.view() + self.assertEqual(i_view.name, ind.name) + + def test_compat(self): + for ind in self.indices.values(): + self.assertEqual(ind.tolist(),list(ind)) + + def test_argsort(self): + for k, ind in self.indices.items(): + + # tested separately + if k in ['catIndex']: + continue + + result = ind.argsort() + expected = np.array(ind).argsort() + self.assert_numpy_array_equal(result, expected) + + def test_pickle(self): + for ind in self.indices.values(): + self.verify_pickle(ind) + ind.name = 'foo' + self.verify_pickle(ind) + + def test_take(self): + indexer = [4, 3, 0, 2] + for k, ind in self.indices.items(): + + # separate + if k in ['boolIndex','tuples','empty']: + continue + + result = ind.take(indexer) + expected = ind[indexer] + self.assertTrue(result.equals(expected)) + + def test_setops_errorcases(self): + for name, idx in compat.iteritems(self.indices): + # # non-iterable input + cases = [0.5, 'xxx'] + methods = [idx.intersection, idx.union, idx.difference, idx.sym_diff] + + for method in methods: + for case in cases: + assertRaisesRegexp(TypeError, + "Input must be Index or array-like",
method, case) + + def test_intersection_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[:5] + second = idx[:3] + intersect = first.intersection(second) + + if isinstance(idx, CategoricalIndex): + pass + else: + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.intersection(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.intersection([1, 2, 3]) + + def test_union_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[3:] + second = idx[:5] + everything = idx + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.union(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.union([1, 2, 3]) + + def test_difference_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[2:] + second = idx[:4] + answer = idx[4:] + result = first.difference(second) + + if isinstance(idx, CategoricalIndex): + pass + else: + self.assertTrue(tm.equalContents(result, answer)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.difference(case) + elif isinstance(idx, CategoricalIndex): + pass + elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): + self.assertEqual(result.__class__, answer.__class__) + self.assert_numpy_array_equal(result.asi8, answer.asi8) + else: + result = first.difference(case) + self.assertTrue(tm.equalContents(result, answer)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.difference([1, 2, 3]) + + def test_symmetric_diff(self): + for name, idx in compat.iteritems(self.indices): + first = idx[1:] + second = idx[:-1] + if isinstance(idx, CategoricalIndex): + pass + else: + answer = idx[[0, -1]] + result = first.sym_diff(second) + self.assertTrue(tm.equalContents(result, answer)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.sym_diff(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.sym_diff(case) + self.assertTrue(tm.equalContents(result, answer)) + + if isinstance(idx, MultiIndex): + msg = "other must be a 
MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.sym_diff([1, 2, 3]) + + +class TestIndex(Base, tm.TestCase): + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict( + unicodeIndex = tm.makeUnicodeIndex(100), + strIndex = tm.makeStringIndex(100), + dateIndex = tm.makeDateIndex(100), + periodIndex = tm.makePeriodIndex(100), + tdIndex = tm.makeTimedeltaIndex(100), + intIndex = tm.makeIntIndex(100), + floatIndex = tm.makeFloatIndex(100), + boolIndex = Index([True,False]), + catIndex = tm.makeCategoricalIndex(100), + empty = Index([]), + tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], + [1, 2, 3])) + ) + self.setup_indices() + + def create_index(self): + return Index(list('abcde')) + + def test_new_axis(self): + new_index = self.dateIndex[None, :] + self.assertEqual(new_index.ndim, 2) + tm.assert_isinstance(new_index, np.ndarray) + + def test_copy_and_deepcopy(self): + super(TestIndex, self).test_copy_and_deepcopy() + + new_copy2 = self.intIndex.copy(dtype=int) + self.assertEqual(new_copy2.dtype.kind, 'i') def test_constructor(self): # regular instance creation @@ -297,18 +505,22 @@ def test_constructor_simple_new(self): result = idx._simple_new(idx, 'obj') self.assertTrue(result.equals(idx)) - def test_copy(self): - i = Index([], name='Foo') - i_copy = i.copy() - self.assertEqual(i_copy.name, 'Foo') + def test_view_with_args(self): - def test_view(self): - i = Index([], name='Foo') - i_view = i.view() - self.assertEqual(i_view.name, 'Foo') + restricted = ['unicodeIndex','strIndex','catIndex','boolIndex','empty'] + + for i in restricted: + ind = self.indices[i] + + # with arguments + self.assertRaises(TypeError, lambda : ind.view('i8')) + + # these are ok + for i in list(set(self.indices.keys())-set(restricted)): + ind = self.indices[i] - # with arguments - self.assertRaises(TypeError, lambda : i.view('i8')) + # with arguments + ind.view('i8') def test_legacy_pickle_identity(self): @@ -330,9 +542,6 @@ def test_astype(self): casted = self.intIndex.astype('i8') self.assertEqual(casted.name, 'foobar') - def test_compat(self): - self.strIndex.tolist() - def test_equals(self): # same self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) @@ -459,11 +668,6 @@ def test_nanosecond_index_access(self): self.assertEqual(first_value, x[Timestamp(np.datetime64('2013-01-01 00:00:00.000000050+0000', 'ns'))]) - def test_argsort(self): - result = self.strIndex.argsort() - expected = np.array(self.strIndex).argsort() - self.assert_numpy_array_equal(result, expected) - def test_comparators(self): index = self.dateIndex element = index[len(index) // 2] @@ -546,16 +750,12 @@ def test_intersection(self): first = self.strIndex[:20] second = self.strIndex[:10] intersect = first.intersection(second) - self.assertTrue(tm.equalContents(intersect, second)) # Corner cases inter = first.intersection(first) self.assertIs(inter, first) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.intersection, 0.5) - idx1 = Index([1, 2, 3, 4, 5], name='idx') # if target has the same name, it is preserved idx2 = Index([3, 4, 5, 6, 7], name='idx') @@ -597,6 +797,12 @@ def test_union(self): union = first.union(second) self.assertTrue(tm.equalContents(union, everything)) + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + # Corner cases union = first.union(first) 
self.assertIs(union, first) @@ -607,9 +813,6 @@ def test_union(self): union = Index([]).union(first) self.assertIs(union, first) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) - # preserve names first.name = 'A' second.name = 'A' @@ -625,6 +828,10 @@ def test_add(self): # - API change GH 8226 with tm.assert_produces_warning(): self.strIndex + self.strIndex + with tm.assert_produces_warning(): + self.strIndex + self.strIndex.tolist() + with tm.assert_produces_warning(): + self.strIndex.tolist() + self.strIndex firstCat = self.strIndex.union(self.dateIndex) secondCat = self.strIndex.union(self.strIndex) @@ -640,6 +847,13 @@ def test_add(self): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) + # test add and radd + idx = Index(list('abc')) + expected = Index(['a1', 'b1', 'c1']) + self.assert_index_equal(idx + '1', expected) + expected = Index(['1a', '1b', '1c']) + self.assert_index_equal('1' + idx, expected) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) @@ -707,9 +921,6 @@ def test_difference(self): self.assertEqual(len(result), 0) self.assertEqual(result.name, first.name) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5) - def test_symmetric_diff(self): # smoke idx1 = Index([1, 2, 3, 4], name='idx1') @@ -756,26 +967,17 @@ def test_symmetric_diff(self): self.assertTrue(tm.equalContents(result, expected)) self.assertEqual(result.name, 'new_name') - # other isn't iterable - with tm.assertRaises(TypeError): - Index(idx1,dtype='object') - 1 - - def test_pickle(self): - - self.verify_pickle(self.strIndex) - self.strIndex.name = 'foo' - self.verify_pickle(self.strIndex) - self.verify_pickle(self.dateIndex) - def test_is_numeric(self): self.assertFalse(self.dateIndex.is_numeric()) self.assertFalse(self.strIndex.is_numeric()) self.assertTrue(self.intIndex.is_numeric()) self.assertTrue(self.floatIndex.is_numeric()) + self.assertFalse(self.catIndex.is_numeric()) def test_is_object(self): self.assertTrue(self.strIndex.is_object()) self.assertTrue(self.boolIndex.is_object()) + self.assertFalse(self.catIndex.is_object()) self.assertFalse(self.intIndex.is_object()) self.assertFalse(self.dateIndex.is_object()) self.assertFalse(self.floatIndex.is_object()) @@ -839,12 +1041,6 @@ def test_format_none(self): idx.format() self.assertIsNone(idx[3]) - def test_take(self): - indexer = [4, 3, 0, 2] - result = self.dateIndex.take(indexer) - expected = self.dateIndex[indexer] - self.assertTrue(result.equals(expected)) - def test_logical_compat(self): idx = self.create_index() self.assertEqual(idx.all(), idx.values.all()) @@ -857,6 +1053,7 @@ def _check_method_works(self, method): method(self.strIndex) method(self.intIndex) method(self.tuples) + method(self.catIndex) def test_get_indexer(self): idx1 = Index([1, 2, 3, 4, 5]) @@ -1036,20 +1233,43 @@ def check_slice(in_slice, expected): def test_drop(self): n = len(self.strIndex) - dropped = self.strIndex.drop(self.strIndex[lrange(5, 10)]) + drop = self.strIndex[lrange(5, 10)] + dropped = self.strIndex.drop(drop) expected = self.strIndex[lrange(5) + lrange(10, n)] self.assertTrue(dropped.equals(expected)) self.assertRaises(ValueError, self.strIndex.drop, ['foo', 'bar']) + self.assertRaises(ValueError, self.strIndex.drop, ['1', 'bar']) + + # errors='ignore' + mixed = drop.tolist() + ['foo'] + dropped = self.strIndex.drop(mixed, errors='ignore') + expected = self.strIndex[lrange(5) + lrange(10, n)] + 
self.assert_index_equal(dropped, expected) + + dropped = self.strIndex.drop(['foo', 'bar'], errors='ignore') + expected = self.strIndex[lrange(n)] + self.assert_index_equal(dropped, expected) dropped = self.strIndex.drop(self.strIndex[0]) expected = self.strIndex[1:] - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) ser = Index([1, 2, 3]) dropped = ser.drop(1) expected = Index([2, 3]) - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) + + # errors='ignore' + self.assertRaises(ValueError, ser.drop, [3, 4]) + + dropped = ser.drop(4, errors='ignore') + expected = Index([1, 2, 3]) + self.assert_index_equal(dropped, expected) + + dropped = ser.drop([3, 4, 5], errors='ignore') + expected = Index([1, 2]) + self.assert_index_equal(dropped, expected) def test_tuple_union_bug(self): import pandas @@ -1174,6 +1394,49 @@ def test_join_self(self): for kind in kinds: joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_str_attribute(self): + # GH9068 + methods = ['strip', 'rstrip', 'lstrip'] + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + for method in methods: + expected = Index([getattr(str, method)(x) for x in idx.values]) + tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected) + + # create a few instances that are not able to use .str accessor + indices = [Index(range(5)), + tm.makeDateIndex(10), + MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]), + PeriodIndex(start='2000', end='2010', freq='A')] + for idx in indices: + with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'): + idx.str.repeat(2) + + idx = Index(['a b c', 'd e', 'f']) + expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']]) + tm.assert_index_equal(idx.str.split(), expected) + tm.assert_index_equal(idx.str.split(expand=False), expected) + + expected = MultiIndex.from_tuples([('a', 'b', 'c'), + ('d', 'e', np.nan), + ('f', np.nan, np.nan)]) + tm.assert_index_equal(idx.str.split(expand=True), expected) + + # test boolean case, should return np.array instead of boolean Index + idx = Index(['a1', 'a2', 'b1', 'b2']) + expected = np.array([True, True, False, False]) + self.assert_array_equal(idx.str.startswith('a'), expected) + self.assertIsInstance(idx.str.startswith('a'), np.ndarray) + s = Series(range(4), index=idx) + expected = Series(range(2), index=['a1', 'a2']) + tm.assert_series_equal(s[s.index.str.startswith('a')], expected) + + def test_tab_completion(self): + # GH 9910 + idx = Index(list('abcd')) + self.assertTrue('str' in dir(idx)) + + idx = Index(range(4)) + self.assertTrue('str' not in dir(idx)) def test_indexing_doesnt_change_class(self): idx = Index([1, 2, 3, 'a', 'b', 'c']) @@ -1263,6 +1526,387 @@ def test_groupby(self): exp = {1: [0, 1], 2: [2, 3, 4]} tm.assert_dict_equal(groups, exp) + def test_equals_op(self): + # For issue #9785 + index_a = Index(['foo', 'bar', 'baz']) + index_b = Index(['foo', 'bar', 'baz', 'qux']) + # Testing Numpy Results Equivelent + assert_array_equal( + index_a.equals(index_a), + index_a == index_a + ) + assert_array_equal( + index_a.equals(index_b), + index_a == index_b, + ) + assert_array_equal( + index_b.equals(index_a), + index_b == index_a, + ) + +class TestCategoricalIndex(Base, tm.TestCase): + _holder = CategoricalIndex + + def setUp(self): + self.indices = dict(catIndex = tm.makeCategoricalIndex(100)) + self.setup_indices() + + def create_index(self, categories=None, ordered=False): + if categories is None: + categories = list('cab') + return 
CategoricalIndex(list('aabbca'), categories=categories, ordered=ordered) + + def test_construction(self): + + ci = self.create_index(categories=list('abcd')) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result,ci,exact=True) + self.assertFalse(result.ordered) + + result = Index(ci.values) + tm.assert_index_equal(result,ci,exact=True) + self.assertFalse(result.ordered) + + # empty + result = CategoricalIndex(categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([],dtype='int8')) + self.assertFalse(result.ordered) + + # passing categories + result = CategoricalIndex(list('aabbca'),categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + + c = pd.Categorical(list('aabbca')) + result = CategoricalIndex(c) + self.assertTrue(result.categories.equals(Index(list('abc')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(c,categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + ci = CategoricalIndex(c,categories=list('abcd')) + result = CategoricalIndex(ci) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(ci, categories=list('ab')) + self.assertTrue(result.categories.equals(Index(list('ab')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(ci, categories=list('ab'), ordered=True) + self.assertTrue(result.categories.equals(Index(list('ab')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8')) + self.assertTrue(result.ordered) + + # turn me to an Index + result = Index(np.array(ci)) + self.assertIsInstance(result, Index) + self.assertNotIsInstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + + # specify dtype + ci = self.create_index(categories=list('abc')) + + result = Index(np.array(ci), dtype='category') + tm.assert_index_equal(result,ci,exact=True) + + result = Index(np.array(ci).tolist(), dtype='category') + tm.assert_index_equal(result,ci,exact=True) + + # these are generally only equal when the categories are reordered + ci = self.create_index() + + result = Index(np.array(ci), dtype='category').reorder_categories(ci.categories) + tm.assert_index_equal(result,ci,exact=True) + + # make sure indexes are handled + expected = CategoricalIndex([0,1,2], categories=[0,1,2], ordered=True) + idx = Index(range(3)) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def test_disallow_set_ops(self): + + # GH 10039 + # set ops (+/-) raise TypeError + idx = pd.Index(pd.Categorical(['a', 'b'])) + + self.assertRaises(TypeError, lambda : idx - idx) + self.assertRaises(TypeError, lambda : idx + idx) + self.assertRaises(TypeError, lambda : idx - ['a','b']) + self.assertRaises(TypeError, lambda : idx + ['a','b']) + self.assertRaises(TypeError, lambda : ['a','b'] - idx) + self.assertRaises(TypeError, lambda : ['a','b'] + idx) + + def 
test_method_delegation(self): + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.set_categories(list('cab')) + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cab'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.rename_categories(list('efg')) + tm.assert_index_equal(result, CategoricalIndex(list('ffggef'), categories=list('efg'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.add_categories(['d']) + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabd'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.remove_categories(['c']) + tm.assert_index_equal(result, CategoricalIndex(list('aabb') + [np.nan] + ['a'], categories=list('ab'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.as_unordered() + tm.assert_index_equal(result, ci) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.as_ordered() + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabdef'), ordered=True)) + + # invalid + self.assertRaises(ValueError, lambda : ci.set_categories(list('cab'), inplace=True)) + + def test_contains(self): + + ci = self.create_index(categories=list('cabdef')) + + self.assertTrue('a' in ci) + self.assertTrue('z' not in ci) + self.assertTrue('e' not in ci) + self.assertTrue(np.nan not in ci) + + # assert codes NOT in index + self.assertFalse(0 in ci) + self.assertFalse(1 in ci) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef') + [np.nan]) + self.assertFalse(np.nan in ci) + + ci = CategoricalIndex(list('aabbca') + [np.nan], categories=list('cabdef') + [np.nan]) + self.assertTrue(np.nan in ci) + + def test_min_max(self): + + ci = self.create_index(ordered=False) + self.assertRaises(TypeError, lambda : ci.min()) + self.assertRaises(TypeError, lambda : ci.max()) + + ci = self.create_index(ordered=True) + + self.assertEqual(ci.min(),'c') + self.assertEqual(ci.max(),'b') + + def test_append(self): + + ci = self.create_index() + categories = ci.categories + + # append cats with the same categories + result = ci[:3].append(ci[3:]) + tm.assert_index_equal(result,ci,exact=True) + + foos = [ci[:1], ci[1:3], ci[3:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result,ci,exact=True) + + # empty + result = ci.append([]) + tm.assert_index_equal(result,ci,exact=True) + + # appending with different categories or reoreded is not ok + self.assertRaises(TypeError, lambda : ci.append(ci.values.set_categories(list('abcd')))) + self.assertRaises(TypeError, lambda : ci.append(ci.values.reorder_categories(list('abc')))) + + # with objects + result = ci.append(['c','a']) + expected = CategoricalIndex(list('aabbcaca'), categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # invalid objects + self.assertRaises(TypeError, lambda : ci.append(['a','d'])) + + def test_insert(self): + + ci = self.create_index() + categories = ci.categories + + #test 0th element + result = ci.insert(0, 'a') + expected = CategoricalIndex(list('aaabbca'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + #test Nth element that follows Python list behavior + result = ci.insert(-1, 'a') + expected = CategoricalIndex(list('aabbcaa'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + #test empty + result = CategoricalIndex(categories=categories).insert(0, 'a') 
+ expected = CategoricalIndex(['a'],categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # invalid + self.assertRaises(TypeError, lambda : ci.insert(0,'d')) + + def test_delete(self): + + ci = self.create_index() + categories = ci.categories + + result = ci.delete(0) + expected = CategoricalIndex(list('abbca'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + result = ci.delete(-1) + expected = CategoricalIndex(list('aabbc'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + with tm.assertRaises((IndexError, ValueError)): + # either depeidnig on numpy version + result = ci.delete(10) + + def test_astype(self): + + ci = self.create_index() + result = ci.astype('category') + tm.assert_index_equal(result,ci,exact=True) + + result = ci.astype(object) + self.assertTrue(result.equals(Index(np.array(ci)))) + + # this IS equal, but not the same class + self.assertTrue(result.equals(ci)) + self.assertIsInstance(result, Index) + self.assertNotIsInstance(result, CategoricalIndex) + + def test_reindex_base(self): + + # determined by cat ordering + idx = self.create_index() + expected = np.array([4,0,1,5,2,3]) + + actual = idx.get_indexer(idx) + assert_array_equal(expected, actual) + + with tm.assertRaisesRegexp(ValueError, 'Invalid fill method'): + idx.get_indexer(idx, method='invalid') + + def test_reindexing(self): + + ci = self.create_index() + oidx = Index(np.array(ci)) + + for n in [1,2,5,len(ci)]: + finder = oidx[np.random.randint(0,len(ci),size=n)] + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + assert_array_equal(expected, actual) + + def test_duplicates(self): + + idx = CategoricalIndex([0, 0, 0]) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.has_duplicates) + + def test_get_indexer(self): + + idx1 = CategoricalIndex(list('aabcde'),categories=list('edabc')) + idx2 = CategoricalIndex(list('abf')) + + for indexer in [idx2, list('abf'), Index(list('abf'))]: + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [0, 1, 2, -1]) + + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='pad')) + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='backfill')) + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='nearest')) + + def test_repr_roundtrip(self): + + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + str(ci) + tm.assert_index_equal(eval(repr(ci)),ci,exact=True) + + # formatting + if compat.PY3: + str(ci) + else: + compat.text_type(ci) + + # long format + # this is not reprable + ci = CategoricalIndex(np.random.randint(0,5,size=100)) + if compat.PY3: + str(ci) + else: + compat.text_type(ci) + + def test_isin(self): + + ci = CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]) + self.assert_numpy_array_equal(ci.isin(['c']),np.array([False,False,False,True,False,False])) + self.assert_numpy_array_equal(ci.isin(['c','a','b']),np.array([True]*5 + [False])) + self.assert_numpy_array_equal(ci.isin(['c','a','b',np.nan]),np.array([True]*6)) + + # mismatched categorical -> coerced to ndarray so doesn't matter + self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('abcdefghi'))),np.array([True]*6)) + self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('defghi'))),np.array([False]*5 + [True])) + + def test_identical(self): + + ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci2 = CategoricalIndex(['a', 
'b'], categories=['a', 'b', 'c'], ordered=True) + self.assertTrue(ci1.identical(ci1)) + self.assertTrue(ci1.identical(ci1.copy())) + self.assertFalse(ci1.identical(ci2)) + + def test_equals(self): + + ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True) + + self.assertTrue(ci1.equals(ci1)) + self.assertFalse(ci1.equals(ci2)) + self.assertTrue(ci1.equals(ci1.astype(object))) + self.assertTrue(ci1.astype(object).equals(ci1)) + + self.assertTrue((ci1 == ci1).all()) + self.assertFalse((ci1 != ci1).all()) + self.assertFalse((ci1 > ci1).all()) + self.assertFalse((ci1 < ci1).all()) + self.assertTrue((ci1 <= ci1).all()) + self.assertTrue((ci1 >= ci1).all()) + + self.assertFalse((ci1 == 1).all()) + self.assertTrue((ci1 == Index(['a','b'])).all()) + self.assertTrue((ci1 == ci1.values).all()) + + # invalid comparisons + self.assertRaises(TypeError, lambda : ci1 == Index(['a','b','c'])) + self.assertRaises(TypeError, lambda : ci1 == ci2) + self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, ordered=False)) + self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, categories=list('abc'))) + + # tests + # make sure that we are testing for category inclusion properly + self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b']).equals(list('aabca'))) + self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b',np.nan]).equals(list('aabca'))) + + self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca'))) + self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca') + [np.nan])) + class Numeric(Base): @@ -1336,24 +1980,38 @@ def test_ufunc_compat(self): expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) tm.assert_index_equal(result, expected) + def test_index_groupby(self): + int_idx = Index(range(6)) + float_idx = Index(np.arange(0, 0.6, 0.1)) + obj_idx = Index('A B C D E F'.split()) + dt_idx = pd.date_range('2013-01-01', freq='M', periods=6) + + for idx in [int_idx, float_idx, obj_idx, dt_idx]: + to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) + self.assertEqual(idx.groupby(to_groupby), + {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]}) + + to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1), + pd.NaT, pd.NaT, + datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values + + ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')])) + expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]} + self.assertEqual(idx.groupby(to_groupby), expected) + class TestFloat64Index(Numeric, tm.TestCase): _holder = Float64Index _multiprocess_can_split_ = True def setUp(self): - self.mixed = Float64Index([1.5, 2, 3, 4, 5]) - self.float = Float64Index(np.arange(5) * 2.5) + self.indices = dict(mixed = Float64Index([1.5, 2, 3, 4, 5]), + float = Float64Index(np.arange(5) * 2.5)) + self.setup_indices() def create_index(self): return Float64Index(np.arange(5,dtype='float64')) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.float).__name__): - hash(self.float) - def test_repr_roundtrip(self): for ind in (self.mixed, self.float): tm.assert_index_equal(eval(repr(ind)), ind) @@ -1519,7 +2177,8 @@ class TestInt64Index(Numeric, tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.index = 
Int64Index(np.arange(0, 20, 2)) + self.indices = dict(index = Int64Index(np.arange(0, 20, 2))) + self.setup_indices() def create_index(self): return Int64Index(np.arange(5,dtype='int64')) @@ -1566,27 +2225,23 @@ def test_constructor_corner(self): with tm.assertRaisesRegexp(TypeError, 'casting'): Int64Index(arr_with_floats) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.index).__name__): - hash(self.index) - def test_copy(self): i = Int64Index([], name='Foo') i_copy = i.copy() self.assertEqual(i_copy.name, 'Foo') def test_view(self): + super(TestInt64Index, self).test_view() + i = Int64Index([], name='Foo') i_view = i.view() self.assertEqual(i_view.name, 'Foo') i_view = i.view('i8') - tm.assert_index_equal(i, Int64Index(i_view)) + tm.assert_index_equal(i, Int64Index(i_view, name='Foo')) i_view = i.view(Int64Index) - tm.assert_index_equal(i, Int64Index(i_view)) + tm.assert_index_equal(i, Int64Index(i_view, name='Foo')) def test_coerce_list(self): # coerce things @@ -1950,7 +2605,7 @@ def test_print_unicode_columns(self): def test_repr_summary(self): with cf.option_context('display.max_seq_items', 10): r = repr(pd.Index(np.arange(1000))) - self.assertTrue(len(r) < 100) + self.assertTrue(len(r) < 200) self.assertTrue("..." in r) def test_repr_roundtrip(self): @@ -1977,7 +2632,25 @@ def test_slice_keep_name(self): class DatetimeLike(Base): + def test_str(self): + + # test the string repr + idx = self.create_index() + idx.name = 'foo' + self.assertFalse("length=%s" % len(idx) in str(idx)) + self.assertTrue("'foo'" in str(idx)) + self.assertTrue(idx.__class__.__name__ in str(idx)) + + if hasattr(idx,'tz'): + if idx.tz is not None: + self.assertTrue("tz='%s'" % idx.tz in str(idx)) + else: + self.assertTrue("tz=None" in str(idx)) + if hasattr(idx,'freq'): + self.assertTrue("freq='%s'" % idx.freqstr in str(idx)) + def test_view(self): + super(DatetimeLike, self).test_view() i = self.create_index() @@ -1993,6 +2666,10 @@ class TestDatetimeIndex(DatetimeLike, tm.TestCase): _holder = DatetimeIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makeDateIndex(10)) + self.setup_indices() + def create_index(self): return date_range('20130101',periods=5) @@ -2106,13 +2783,47 @@ def test_time_overflow_for_32bit_machines(self): idx2 = pd.date_range(end='2000', periods=periods, freq='S') self.assertEqual(len(idx2), periods) + def test_intersection(self): + first = self.index + second = self.index[5:] + intersect = first.intersection(second) + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + third = Index(['a', 'b', 'c']) + result = first.intersection(third) + expected = pd.Index([], dtype=object) + self.assert_index_equal(result, expected) + + def test_union(self): + first = self.index[:5] + second = self.index[5:] + everything = self.index + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makePeriodIndex(10)) + self.setup_indices() + def 
create_index(self): - return period_range('20130101',periods=5,freq='D') + return period_range('20130101', periods=5, freq='D') def test_pickle_compat_construction(self): pass @@ -2145,6 +2856,10 @@ class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makeTimedeltaIndex(10)) + self.setup_indices() + def create_index(self): return pd.to_timedelta(range(5),unit='d') + pd.offsets.Hour(1) @@ -2219,9 +2934,10 @@ def setUp(self): major_labels = np.array([0, 0, 1, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) self.index_names = ['first', 'second'] - self.index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], - names=self.index_names, verify_integrity=False) + self.indices = dict(index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=self.index_names, verify_integrity=False)) + self.setup_indices() def create_index(self): return self.index @@ -2257,13 +2973,7 @@ def test_labels_dtypes(self): self.assertTrue((i.labels[0]>=0).all()) self.assertTrue((i.labels[1]>=0).all()) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.index).__name__): - hash(self.index) - - def test_set_names_and_rename(self): + def test_set_name_methods(self): # so long as these are synonyms, we don't need to test set_names self.assertEqual(self.index.rename, self.index.set_names) new_names = [name + "SUFFIX" for name in self.index_names] @@ -3402,6 +4112,12 @@ def test_difference(self): # - API change GH 8226 with tm.assert_produces_warning(): first - self.index[-3:] + with tm.assert_produces_warning(): + self.index[-3:] - first + with tm.assert_produces_warning(): + self.index[-3:] - first.tolist() + + self.assertRaises(TypeError, lambda : first.tolist() - self.index[-3:]) expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), sortorder=0, @@ -3529,21 +4245,50 @@ def test_drop(self): dropped2 = self.index.drop(index) expected = self.index[[0, 2, 3, 5]] - self.assertTrue(dropped.equals(expected)) - self.assertTrue(dropped2.equals(expected)) + self.assert_index_equal(dropped, expected) + self.assert_index_equal(dropped2, expected) dropped = self.index.drop(['bar']) expected = self.index[[0, 1, 3, 4, 5]] - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) + + dropped = self.index.drop('foo') + expected = self.index[[2, 3, 4, 5]] + self.assert_index_equal(dropped, expected) index = MultiIndex.from_tuples([('bar', 'two')]) self.assertRaises(KeyError, self.index.drop, [('bar', 'two')]) self.assertRaises(KeyError, self.index.drop, index) + self.assertRaises(KeyError, self.index.drop, ['foo', 'two']) + + # partially correct argument + mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')]) + self.assertRaises(KeyError, self.index.drop, mixed_index) + + # error='ignore' + dropped = self.index.drop(index, errors='ignore') + expected = self.index[[0, 1, 2, 3, 4, 5]] + self.assert_index_equal(dropped, expected) + + dropped = self.index.drop(mixed_index, errors='ignore') + expected = self.index[[0, 1, 2, 3, 5]] + self.assert_index_equal(dropped, expected) + + dropped = self.index.drop(['foo', 'two'], errors='ignore') + expected = self.index[[2, 3, 4, 5]] + self.assert_index_equal(dropped, expected) # mixed partial / full drop dropped = self.index.drop(['foo', ('qux', 'one')]) expected = self.index[[2, 3, 5]] - 
self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) + + # mixed partial / full drop / error='ignore' + mixed_index = ['foo', ('qux', 'one'), 'two'] + self.assertRaises(KeyError, self.index.drop, mixed_index) + dropped = self.index.drop(mixed_index, errors='ignore') + expected = self.index[[2, 3, 5]] + self.assert_index_equal(dropped, expected) def test_droplevel_with_names(self): index = self.index[self.index.get_loc('foo')] @@ -3734,7 +4479,7 @@ def test_reindex_level(self): assertRaisesRegexp(TypeError, "Fill method not supported", idx.reindex, idx, method='bfill', level='first') - def test_has_duplicates(self): + def test_duplicates(self): self.assertFalse(self.index.has_duplicates) self.assertTrue(self.index.append(self.index).has_duplicates) @@ -3848,7 +4593,25 @@ def test_repr_with_unicode_data(self): self.assertFalse("\\u" in repr(index)) # we don't want unicode-escaped def test_repr_roundtrip(self): - tm.assert_index_equal(eval(repr(self.index)), self.index) + + mi = MultiIndex.from_product([list('ab'),range(3)],names=['first','second']) + str(mi) + tm.assert_index_equal(eval(repr(mi)),mi,exact=True) + + # formatting + if compat.PY3: + str(mi) + else: + compat.text_type(mi) + + # long format + mi = MultiIndex.from_product([list('abcdefg'),range(10)],names=['first','second']) + result = str(mi) + tm.assert_index_equal(eval(repr(mi)),mi,exact=True) + + def test_str(self): + # tested elsewhere + pass def test_unicode_string_with_unicode(self): d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} @@ -3989,6 +4752,61 @@ def test_groupby(self): exp = dict((key, [key]) for key in self.index) tm.assert_dict_equal(groups, exp) + def test_index_name_retained(self): + # GH9857 + result = pd.DataFrame({'x': [1, 2, 6], + 'y': [2, 2, 8], + 'z': [-5, 0, 5]}) + result = result.set_index('z') + result.loc[10] = [9, 10] + df_expected = pd.DataFrame({'x': [1, 2, 6, 9], + 'y': [2, 2, 8, 10], + 'z': [-5, 0, 5, 10]}) + df_expected = df_expected.set_index('z') + tm.assert_frame_equal(result, df_expected) + + def test_equals_operator(self): + # For issue #9785 + self.assertTrue((self.index == self.index).all()) + + def test_index_compare(self): + # For issue #9785 + index_unequal = Index(['foo', 'bar', 'baz']) + index_equal = Index([ + ('foo', 'one'), ('foo', 'two'), ('bar', 'one'), + ('baz', 'two'), ('qux', 'one'), ('qux', 'two') + ], tupleize_cols=False) + # Testing Numpy Results Equivelent + assert_array_equal( + index_unequal.equals(self.index), + index_unequal == self.index, + err_msg = 'Index compared with MultiIndex failed', + ) + assert_array_equal( + self.index.equals(index_unequal), + self.index == index_unequal, + err_msg = 'MultiIndex compared with Index failed', + ) + assert_array_equal( + self.index.equals(index_equal), + self.index == index_equal, + err_msg = 'MultiIndex compared with Similar Index failed', + ) + assert_array_equal( + index_equal.equals(self.index), + index_equal == self.index, + err_msg = 'Index compared with Similar MultiIndex failed', + ) + # Testing that the result is true for the index_equal case + self.assertTrue( + (self.index == index_equal).all(), + msg='Assert Index compared with Similar MultiIndex match' + ) + self.assertTrue( + (index_equal == self.index).all(), + msg='Assert MultiIndex compared with Similar Index match' + ) + def test_get_combined_index(): from pandas.core.index import _get_combined_index diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index ee6140828882c..c998ce65791a3 
100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1063,6 +1063,7 @@ def test_loc_setitem_consistency(self): # empty (essentially noops) expected = DataFrame(columns=['x', 'y']) + expected['x'] = expected['x'].astype(np.int64) df = DataFrame(columns=['x', 'y']) df.loc[:, 'x'] = 1 assert_frame_equal(df,expected) @@ -1438,6 +1439,13 @@ def test_iloc_setitem_series(self): result = s.iloc[:4] assert_series_equal(result, expected) + s= Series([-1]*6) + s.iloc[0::2]= [0,2,4] + s.iloc[1::2]= [1,3,5] + result = s + expected= Series([0,1,2,3,4,5]) + assert_series_equal(result, expected) + def test_iloc_setitem_list_of_lists(self): # GH 7551 @@ -2366,6 +2374,7 @@ def test_dups_fancy_indexing(self): rows = ['C','B','E'] expected = DataFrame({'test' : [11,9,np.nan], 'test1': [7.,6,np.nan], 'other': ['d','c',np.nan]},index=rows) + result = df.ix[rows] assert_frame_equal(result, expected) @@ -3368,7 +3377,7 @@ def f(): expected = DataFrame(columns=['foo']) def f(): df = DataFrame() - df['foo'] = Series([]) + df['foo'] = Series([], dtype='object') return df assert_frame_equal(f(), expected) def f(): @@ -3378,9 +3387,12 @@ def f(): assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = Series(range(len(df))) + df['foo'] = df.index return df assert_frame_equal(f(), expected) + + expected = DataFrame(columns=['foo']) + expected['foo'] = expected['foo'].astype('float64') def f(): df = DataFrame() df['foo'] = [] @@ -3388,7 +3400,7 @@ def f(): assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = df.index + df['foo'] = Series(range(len(df))) return df assert_frame_equal(f(), expected) def f(): @@ -3421,14 +3433,21 @@ def f(): # GH5720, GH5744 # don't create rows when empty + expected = DataFrame(columns=['A','B','New']) + expected['A'] = expected['A'].astype('int64') + expected['B'] = expected['B'].astype('float64') + expected['New'] = expected['New'].astype('float64') df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] y['New'] = np.nan - assert_frame_equal(y,DataFrame(columns=['A','B','New'])) + assert_frame_equal(y,expected) + #assert_frame_equal(y,expected) + expected = DataFrame(columns=['a','b','c c','d']) + expected['d'] = expected['d'].astype('int64') df = DataFrame(columns=['a', 'b', 'c c']) df['d'] = 3 - assert_frame_equal(df,DataFrame(columns=['a','b','c c','d'])) + assert_frame_equal(df,expected) assert_series_equal(df['c c'],Series(name='c c',dtype=object)) # reindex columns is ok @@ -3436,6 +3455,9 @@ def f(): y = df[df.A > 5] result = y.reindex(columns=['A','B','C']) expected = DataFrame(columns=['A','B','C']) + expected['A'] = expected['A'].astype('int64') + expected['B'] = expected['B'].astype('float64') + expected['C'] = expected['C'].astype('float64') assert_frame_equal(result,expected) # GH 5756 @@ -4411,6 +4433,250 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: s.ix[::0]) + def test_indexing_assignment_dict_already_exists(self): + df = pd.DataFrame({'x': [1, 2, 6], + 'y': [2, 2, 8], + 'z': [-5, 0, 5]}).set_index('z') + expected = df.copy() + rhs = dict(x=9, y=99) + df.loc[5] = rhs + expected.loc[5] = [9, 99] + tm.assert_frame_equal(df, expected) + + def test_indexing_dtypes_on_empty(self): + # Check that .iloc and .ix return correct dtypes GH9983 + df = DataFrame({'a':[1,2,3],'b':['b','b2','b3']}) + df2 = df.ix[[],:] + + self.assertEqual(df2.loc[:,'a'].dtype, np.int64) + assert_series_equal(df2.loc[:,'a'], df2.iloc[:,0]) + 
assert_series_equal(df2.loc[:,'a'], df2.ix[:,0]) + + + +class TestCategoricalIndex(tm.TestCase): + + def setUp(self): + + self.df = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + self.df2 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cabe')) }).set_index('B') + self.df3 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=True) }).set_index('B') + self.df4 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=False) }).set_index('B') + + + def test_loc_scalar(self): + + result = self.df.loc['a'] + expected = DataFrame({'A' : [0,1,5], + 'B' : Series(list('aaa')).astype('category',categories=list('cab')) }).set_index('B') + assert_frame_equal(result, expected) + + + df = self.df.copy() + df.loc['a'] = 20 + expected = DataFrame({'A' : [20,20,2,3,4,20], + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + assert_frame_equal(df, expected) + + # value not in the categories + self.assertRaises(KeyError, lambda : df.loc['d']) + + def f(): + df.loc['d'] = 10 + self.assertRaises(TypeError, f) + + def f(): + df.loc['d','A'] = 10 + self.assertRaises(TypeError, f) + + def f(): + df.loc['d','C'] = 10 + self.assertRaises(TypeError, f) + + def test_loc_listlike(self): + + # list of labels + result = self.df.loc[['c','a']] + expected = self.df.iloc[[4,0,1,5]] + assert_frame_equal(result, expected) + + result = self.df2.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + # element in the categories but not in the values + self.assertRaises(KeyError, lambda : self.df2.loc['e']) + + # assign is ok + df = self.df2.copy() + df.loc['e'] = 20 + result = df.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,20], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + df = self.df2.copy() + result = df.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + + # not all labels in the categories + self.assertRaises(KeyError, lambda : self.df2.loc[['a','d']]) + + def test_read_only_source(self): + # GH 10043 + rw_array = np.eye(10) + rw_df = DataFrame(rw_array) + + ro_array = np.eye(10) + ro_array.setflags(write=False) + ro_df = DataFrame(ro_array) + + assert_frame_equal(rw_df.iloc[[1,2,3]],ro_df.iloc[[1,2,3]]) + assert_frame_equal(rw_df.iloc[[1]],ro_df.iloc[[1]]) + assert_series_equal(rw_df.iloc[1],ro_df.iloc[1]) + assert_frame_equal(rw_df.iloc[1:3],ro_df.iloc[1:3]) + + assert_frame_equal(rw_df.loc[[1,2,3]],ro_df.loc[[1,2,3]]) + assert_frame_equal(rw_df.loc[[1]],ro_df.loc[[1]]) + assert_series_equal(rw_df.loc[1],ro_df.loc[1]) + assert_frame_equal(rw_df.loc[1:3],ro_df.loc[1:3]) + + def test_reindexing(self): + + # reindexing + # convert to a regular index + result = self.df2.reindex(['a','b','e']) + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b']) + expected = DataFrame({'A' : 
[0,1,5,2,3], + 'B' : Series(list('aaabb')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['e']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['e']) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['d']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['d']) }).set_index('B') + assert_frame_equal(result, expected) + + # since we are actually reindexing with a Categorical + # then return a Categorical + cats = list('cabe') + + result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats)) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(pd.Categorical(['a'],categories=cats)) + expected = DataFrame({'A' : [0,1,5], + 'B' : Series(list('aaa')).astype('category',categories=cats) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b','e']) + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b']) + expected = DataFrame({'A' : [0,1,5,2,3], + 'B' : Series(list('aaabb')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['e']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['e']) }).set_index('B') + assert_frame_equal(result, expected) + + # give back the type of categorical that we received + result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True)) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d'])) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B') + assert_frame_equal(result, expected) + + # passed duplicate indexers are not allowed + self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a'])) + + # args NotImplemented ATM + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],method='ffill')) + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],level=1)) + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],limit=2)) + + def test_loc_slice(self): + + # slicing + # not implemented ATM + # GH9748 + + self.assertRaises(TypeError, lambda : self.df.loc[1:5]) + + #result = df.loc[1:5] + #expected = df.iloc[[1,2,3,4]] + #assert_frame_equal(result, expected) + + def test_boolean_selection(self): + + df3 = self.df3 + df4 = self.df4 + + result = df3[df3.index == 'a'] + expected = df3.iloc[[]] + assert_frame_equal(result,expected) + + result = df4[df4.index == 'a'] + expected = df4.iloc[[]] + assert_frame_equal(result,expected) + + result = df3[df3.index == 1] + expected = df3.iloc[[0,1,3]] + assert_frame_equal(result,expected) + + result = df4[df4.index == 1] + expected = df4.iloc[[0,1,3]] + assert_frame_equal(result,expected) + + # since we have an ordered categorical + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=True, + # name=u'B') + result = df3[df3.index < 2] + expected = df3.iloc[[4]] + assert_frame_equal(result,expected) + + result = df3[df3.index > 1] + expected = df3.iloc[[]] + assert_frame_equal(result,expected) + + # unordered + # cannot be 
compared + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=False, + # name=u'B') + self.assertRaises(TypeError, lambda : df4[df4.index < 2]) + self.assertRaises(TypeError, lambda : df4[df4.index > 1]) class TestSeriesNoneCoercion(tm.TestCase): EXPECTED_RESULTS = [ diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 45f089f5e0a53..36585abd1b98f 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -68,15 +68,15 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): elif typestr in ('object', 'string', 'O'): values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], shape) - elif typestr in ('bool'): + elif typestr in ('b','bool',): values = np.ones(shape, dtype=np.bool_) elif typestr in ('datetime', 'dt', 'M8[ns]'): values = (mat * 1e9).astype('M8[ns]') elif typestr in ('timedelta', 'td', 'm8[ns]'): values = (mat * 1).astype('m8[ns]') - elif typestr in ('category'): + elif typestr in ('category',): values = Categorical([1,1,2,2,3,3,3,3,4,4]) - elif typestr in ('category2'): + elif typestr in ('category2',): values = Categorical(['a','a','a','a','b','b','c','c','c','d']) elif typestr in ('sparse', 'sparse_na'): # FIXME: doesn't support num_rows != 10 @@ -751,6 +751,25 @@ def test_equals(self): bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) + def test_equals_block_order_different_dtypes(self): + # GH 9330 + + mgr_strings = [ + "a:i8;b:f8", # basic case + "a:i8;b:f8;c:c8;d:b", # many types + "a:i8;e:dt;f:td;g:string", # more types + "a:i8;b:category;c:category2;d:category2", # categories + "c:sparse;d:sparse_na;b:f8", # sparse + ] + + for mgr_string in mgr_strings: + bm = create_mgr(mgr_string) + block_perms = itertools.permutations(bm.blocks) + for bm_perm in block_perms: + bm_this = BlockManager(bm_perm, bm.axes) + self.assertTrue(bm.equals(bm_this)) + self.assertTrue(bm_this.equals(bm)) + def test_single_mgr_ctor(self): mgr = create_single_mgr('f8', num_rows=5) self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.]) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index bb860269c5144..6d9bea29cf44d 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -6,20 +6,42 @@ import pandas as pd from pandas.lib import isscalar, item_from_zerodim, max_len_string_array import pandas.util.testing as tm -from pandas.compat import u +from pandas.compat import u, PY2 + class TestMisc(tm.TestCase): def test_max_len_string_array(self): - arr = np.array(['foo','b',np.nan],dtype='object') - self.assertTrue(max_len_string_array(arr),3) + arr = a = np.array(['foo', 'b', np.nan], dtype='object') + self.assertTrue(max_len_string_array(arr), 3) # unicode - arr = arr.astype('U') - self.assertTrue(max_len_string_array(arr),3) + arr = a.astype('U').astype(object) + self.assertTrue(max_len_string_array(arr), 3) + + # bytes for python3 + arr = a.astype('S').astype(object) + self.assertTrue(max_len_string_array(arr), 3) + + # raises + tm.assertRaises(TypeError, + lambda: max_len_string_array(arr.astype('U'))) + + def test_infer_dtype_bytes(self): + compare = 'string' if PY2 else 'bytes' + + # string array of bytes + arr = np.array(list('abc'), dtype='S1') + self.assertEqual(pd.lib.infer_dtype(arr), compare) + + # object array of bytes + arr = arr.astype(object) + self.assertEqual(pd.lib.infer_dtype(arr), compare) + class TestIsscalar(tm.TestCase): + def test_isscalar_builtin_scalars(self): self.assertTrue(isscalar(None)) 
self.assertTrue(isscalar(True)) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e6a0f5d7ef45d..b2efc20aa0694 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -426,8 +426,15 @@ def test_frame_setitem_multi_column(self): # it broadcasts df['B', '1'] = [1, 2, 3] df['A'] = df['B', '1'] - assert_series_equal(df['A', '1'], df['B', '1']) - assert_series_equal(df['A', '2'], df['B', '1']) + + sliced_a1 = df['A', '1'] + sliced_a2 = df['A', '2'] + sliced_b1 = df['B', '1'] + assert_series_equal(sliced_a1, sliced_b1, check_names=False) + assert_series_equal(sliced_a2, sliced_b1, check_names=False) + self.assertEqual(sliced_a1.name, ('A', '1')) + self.assertEqual(sliced_a2.name, ('A', '2')) + self.assertEqual(sliced_b1.name, ('B', '1')) def test_getitem_tuple_plus_slice(self): # GH #671 @@ -461,7 +468,9 @@ def test_getitem_multilevel_index_tuple_unsorted(self): df = df.set_index(index_columns) query_index = df.index[:1] rs = df.ix[query_index, "data"] - xp = Series(['x'], index=MultiIndex.from_tuples([(0, 1, 0)])) + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) + xp = Series(['x'], index=xp_idx, name='data') assert_series_equal(rs, xp) def test_xs(self): @@ -865,7 +874,7 @@ def test_count_level_series(self): def test_count_level_corner(self): s = self.frame['A'][:0] result = s.count(level=0) - expected = Series(0, index=s.index.levels[0]) + expected = Series(0, index=s.index.levels[0], name='A') assert_series_equal(result, expected) df = self.frame[:0] @@ -982,7 +991,9 @@ def test_stack_mixed_dtype(self): df = df.sortlevel(1, axis=1) stacked = df.stack() - assert_series_equal(stacked['foo'], df['foo'].stack()) + result = df['foo'].stack() + assert_series_equal(stacked['foo'], result, check_names=False) + self.assertIs(result.name, None) self.assertEqual(stacked['bar'].dtype, np.float_) def test_unstack_bug(self): @@ -1430,11 +1441,13 @@ def test_count(self): result = series.count(level='b') expect = self.series.count(level=1) - assert_series_equal(result, expect) + assert_series_equal(result, expect, check_names=False) + self.assertEqual(result.index.name, 'b') result = series.count(level='a') expect = self.series.count(level=0) - assert_series_equal(result, expect) + assert_series_equal(result, expect, check_names=False) + self.assertEqual(result.index.name, 'a') self.assertRaises(KeyError, series.count, 'x') self.assertRaises(KeyError, frame.count, level='x') @@ -1738,12 +1751,12 @@ def test_mixed_depth_get(self): result = df['a'] expected = df['a', '', ''] - assert_series_equal(result, expected) + assert_series_equal(result, expected, check_names=False) self.assertEqual(result.name, 'a') result = df['routine1', 'result1'] expected = df['routine1', 'result1', ''] - assert_series_equal(result, expected) + assert_series_equal(result, expected, check_names=False) self.assertEqual(result.name, ('routine1', 'result1')) def test_mixed_depth_insert(self): @@ -1825,7 +1838,7 @@ def test_mixed_depth_pop(self): df2 = df.copy() result = df1.pop('a') expected = df2.pop(('a', '', '')) - assert_series_equal(expected, result) + assert_series_equal(expected, result, check_names=False) assert_frame_equal(df1, df2) self.assertEqual(result.name, 'a') diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 2a605cba8a6c0..1adb8a5d9217c 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,7 +5,7 @@ import numpy as np -from pandas.core.common import isnull +from 
pandas.core.common import isnull, is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm @@ -323,6 +323,32 @@ def test_nanmean(self): allow_complex=False, allow_obj=False, allow_str=False, allow_date=False, allow_tdelta=True) + def test_nanmean_overflow(self): + # GH 10155 + # In the previous implementation mean can overflow for int dtypes, it + # is now consistent with numpy + from pandas import Series + + # numpy < 1.9.0 is not computing this correctly + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.9.0': + for a in [2 ** 55, -2 ** 55, 20150515061816532]: + s = Series(a, index=range(500), dtype=np.int64) + result = s.mean() + np_result = s.values.mean() + self.assertEqual(result, a) + self.assertEqual(result, np_result) + self.assertTrue(result.dtype == np.float64) + + # check returned dtype + for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]: + s = Series(range(10), dtype=dtype) + result = s.mean() + if is_integer_dtype(dtype): + self.assertTrue(result.dtype == np.float64) + else: + self.assertTrue(result.dtype == dtype) + def test_nanmedian(self): self.check_funs(nanops.nanmedian, np.median, allow_complex=False, allow_str=False, allow_date=False, diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index cab668b3118fd..57fd465993e14 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -404,6 +404,8 @@ def test_abs(self): expected = np.abs(s) assert_series_equal(result, expected) assert_series_equal(result2, expected) + self.assertEqual(result.name, 'A') + self.assertEqual(result2.name, 'A') class CheckIndexing(object): @@ -509,7 +511,9 @@ def test_major_xs(self): idx = self.panel.major_axis[5] xs = self.panel.major_xs(idx) - assert_series_equal(xs['ItemA'], ref.xs(idx)) + result = xs['ItemA'] + assert_series_equal(result, ref.xs(idx), check_names=False) + self.assertEqual(result.name, 'ItemA') # not contained idx = self.panel.major_axis[0] - bday @@ -527,7 +531,7 @@ def test_minor_xs(self): idx = self.panel.minor_axis[1] xs = self.panel.minor_xs(idx) - assert_series_equal(xs['ItemA'], ref[idx]) + assert_series_equal(xs['ItemA'], ref[idx], check_names=False) # not contained self.assertRaises(Exception, self.panel.minor_xs, 'E') @@ -658,7 +662,7 @@ def test_ix_setitem_slice_dataframe(self): def test_ix_align(self): from pandas import Series - b = Series(np.random.randn(10)) + b = Series(np.random.randn(10), name=0) b.sort() df_orig = Panel(np.random.randn(3, 10, 2)) df = df_orig.copy() @@ -960,6 +964,12 @@ def _check_dtype(panel, dtype): panel = Panel(np.random.randn(2,10,5),items=lrange(2),major_axis=lrange(10),minor_axis=lrange(5),dtype=dtype) _check_dtype(panel,dtype) + for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: + df1 = DataFrame(np.random.randn(2, 5), index=lrange(2), columns=lrange(5)) + df2 = DataFrame(np.random.randn(2, 5), index=lrange(2), columns=lrange(5)) + panel = Panel.from_dict({'a': df1, 'b': df2}, dtype=dtype) + _check_dtype(panel, dtype) + def test_constructor_fails_with_not_3d_input(self): with tm.assertRaisesRegexp(ValueError, "The number of dimensions required is 3"): @@ -1696,22 +1706,23 @@ def test_shift(self): # major idx = self.panel.major_axis[0] idx_lag = self.panel.major_axis[1] - shifted = self.panel.shift(1) - assert_frame_equal(self.panel.major_xs(idx), shifted.major_xs(idx_lag)) # minor idx = self.panel.minor_axis[0] idx_lag = self.panel.minor_axis[1] - shifted = self.panel.shift(1, axis='minor') - 
assert_frame_equal(self.panel.minor_xs(idx), shifted.minor_xs(idx_lag)) - self.assertRaises(Exception, self.panel.shift, 1, axis='items') + # items + idx = self.panel.items[0] + idx_lag = self.panel.items[1] + shifted = self.panel.shift(1, axis='items') + assert_frame_equal(self.panel[idx], + shifted[idx_lag]) # negative numbers, #2164 result = self.panel.shift(-1) @@ -1984,6 +1995,15 @@ def check_drop(drop_val, axis_number, aliases, expected): expected = Panel({"One": df}) check_drop('Two', 0, ['items'], expected) + self.assertRaises(ValueError, panel.drop, 'Three') + + # errors = 'ignore' + dropped = panel.drop('Three', errors='ignore') + assert_panel_equal(dropped, panel) + dropped = panel.drop(['Two', 'Three'], errors='ignore') + expected = Panel({"One": df}) + assert_panel_equal(dropped, expected) + # Major exp_df = DataFrame({"A": [2], "B": [4]}, index=[1]) expected = Panel({"One": exp_df, "Two": exp_df}) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 66f5110830c72..346c9e2598985 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -151,6 +151,8 @@ def test_multiindex(self): class TestGetDummies(tm.TestCase): + sparse = False + def setUp(self): self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) @@ -163,20 +165,20 @@ def test_basic(self): expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}, 'c': {0: 0.0, 1: 0.0, 2: 1.0}}) - assert_frame_equal(get_dummies(s_list), expected) - assert_frame_equal(get_dummies(s_series), expected) + assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) + assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) expected.index = list('ABC') - assert_frame_equal(get_dummies(s_series_index), expected) + assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected) def test_just_na(self): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index = ['A']) - res_list = get_dummies(just_na_list) - res_series = get_dummies(just_na_series) - res_series_index = get_dummies(just_na_series_index) + res_list = get_dummies(just_na_list, sparse=self.sparse) + res_series = get_dummies(just_na_series, sparse=self.sparse) + res_series_index = get_dummies(just_na_series_index, sparse=self.sparse) self.assertEqual(res_list.empty, True) self.assertEqual(res_series.empty, True) @@ -188,12 +190,13 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] - res = get_dummies(s) + res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) assert_frame_equal(res, exp) - res_na = get_dummies(s, dummy_na=True) + # Sparse dataframes do not allow nan labelled columns, see #GH8822 + res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, 'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1) @@ -201,7 +204,7 @@ def test_include_na(self): exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True) + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan]) assert_array_equal(res_just_na.values, exp_just_na.values) @@ -210,21 +213,21 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH 
ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter') + res = get_dummies(s, prefix='letter', sparse=self.sparse) exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0}, u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}}) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) @@ -235,7 +238,7 @@ def test_dataframe_dummies_prefix_list(self): prefixes = ['from_A', 'from_B'] df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1]}) @@ -243,10 +246,10 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_c']] assert_frame_equal(result, expected) - def test_datafrmae_dummies_prefix_str(self): + def test_dataframe_dummies_prefix_str(self): # not that you should do this... df = self.df - result = get_dummies(df, prefix='bad') + result = get_dummies(df, prefix='bad', sparse=self.sparse) expected = DataFrame([[1, 1., 0., 1., 0.], [2, 0., 1., 1., 0.], [3, 1., 0., 0., 1.]], @@ -256,40 +259,40 @@ def test_datafrmae_dummies_prefix_str(self): def test_dataframe_dummies_subset(self): df = self.df result = get_dummies(df, prefix=['from_A'], - columns=['A']) + columns=['A'], sparse=self.sparse) expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): df = self.df - result = get_dummies(df, prefix_sep='..') + result = get_dummies(df, prefix_sep='..', sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1], 'A..b': [0., 1, 0], 'B..b': [1., 1, 0], 'B..c': [0., 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__']) + result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}) + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=self.sparse) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self): with tm.assertRaises(ValueError): - get_dummies(self.df, prefix=['too few']) + get_dummies(self.df, prefix=['too few'], sparse=self.sparse) def test_dataframe_dummies_prefix_sep_bad_length(self): with tm.assertRaises(ValueError): - get_dummies(self.df, prefix_sep=['bad']) + get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse) def test_dataframe_dummies_prefix_dict(self): prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 
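The reshape tests here are parametrized over a new sparse keyword, so the same expectations run for dense and sparse dummy encodings (TestGetDummiesSparse later reruns the whole suite with sparse=True). A hedged usage sketch; the exact sparse container returned depends on the pandas version, the tests only require its values to match the dense result:

    import pandas as pd

    s = pd.Series(['a', 'b', 'a'])

    dense = pd.get_dummies(s)                  # dense 0/1 indicator columns
    sparse = pd.get_dummies(s, sparse=True)    # same indicators, sparse storage
    prefixed = pd.get_dummies(s, prefix='letter')   # columns 'letter_a', 'letter_b'

    print(dense)
    print(type(sparse))   # sparse-backed container, values identical per the tests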
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1], 'C': [1, 2, 3]}) @@ -298,7 +301,7 @@ def test_dataframe_dummies_prefix_dict(self): def test_dataframe_dummies_with_na(self): df = self.df df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True) + result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0], 'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0], 'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]}) @@ -306,14 +309,14 @@ def test_dataframe_dummies_with_na(self): 'B_nan']] assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False) + result = get_dummies(df, dummy_na=False, sparse=self.sparse) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1], 'cat_x': [1., 0, 0], @@ -322,6 +325,11 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x', 'cat_y']] assert_frame_equal(result, expected) + +class TestGetDummiesSparse(TestGetDummies): + sparse = True + + class TestLreshape(tm.TestCase): def test_pairs(self): diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index c021bb1bf2fd6..bbe942e607faf 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -83,7 +83,7 @@ def test_dt_namespace_accessor(self): ok_for_period = ok_for_base + ['qyear'] ok_for_dt = ok_for_base + ['date','time','microsecond','nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end', 'tz'] - ok_for_dt_methods = ['to_period','to_pydatetime','tz_localize','tz_convert'] + ok_for_dt_methods = ['to_period','to_pydatetime','tz_localize','tz_convert', 'normalize'] ok_for_td = ['days','seconds','microseconds','nanoseconds'] ok_for_td_methods = ['components','to_pytimedelta'] @@ -165,6 +165,7 @@ def compare(s, name): tm.assert_series_equal(s.dt.year,Series(np.array([2014,2014,2014],dtype='int64'),index=index)) tm.assert_series_equal(s.dt.month,Series(np.array([2,2,2],dtype='int64'),index=index)) tm.assert_series_equal(s.dt.second,Series(np.array([0,1,2],dtype='int64'),index=index)) + tm.assert_series_equal(s.dt.normalize(), pd.Series([s[0]] * 3, index=index)) # periodindex for s in [Series(period_range('20130101',periods=5,freq='D'))]: @@ -242,11 +243,32 @@ def test_dt_accessor_api(self): s.dt self.assertFalse(hasattr(s, 'dt')) - def test_binop_maybe_preserve_name(self): + def test_tab_completion(self): + # GH 9910 + s = Series(list('abcd')) + # Series of str values should have .str but not .dt/.cat in __dir__ + self.assertTrue('str' in dir(s)) + self.assertTrue('dt' not in dir(s)) + self.assertTrue('cat' not in dir(s)) + + # similiarly for .dt + s = Series(date_range('1/1/2015', periods=5)) + self.assertTrue('dt' in dir(s)) + self.assertTrue('str' not in dir(s)) + self.assertTrue('cat' not in dir(s)) + + # similiarly for .cat + s = Series(list('abbcd'), dtype="category") + self.assertTrue('cat' in dir(s)) + self.assertTrue('str' not in dir(s)) + self.assertTrue('dt' not in dir(s)) + def test_binop_maybe_preserve_name(self): # names match, preserve result = self.ts * self.ts self.assertEqual(result.name, self.ts.name) + result = self.ts.mul(self.ts) + 
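test_tab_completion (GH 9910) above asserts that only the accessor matching the dtype shows up in dir(), and the .dt namespace gains a normalize() method. A short sketch of both behaviours:

    import pandas as pd

    s_str = pd.Series(list('abcd'))
    assert 'str' in dir(s_str) and 'dt' not in dir(s_str)

    s_dt = pd.Series(pd.date_range('2015-01-01 10:30', periods=3))
    assert 'dt' in dir(s_dt) and 'str' not in dir(s_dt)

    print(s_dt.dt.normalize())    # times reset to midnight, dates kept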
self.assertEqual(result.name, self.ts.name) result = self.ts * self.ts[:-2] self.assertEqual(result.name, self.ts.name) @@ -256,6 +278,22 @@ def test_binop_maybe_preserve_name(self): cp.name = 'something else' result = self.ts + cp self.assertIsNone(result.name) + result = self.ts.add(cp) + self.assertIsNone(result.name) + + ops = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow'] + ops = ops + ['r' + op for op in ops] + for op in ops: + # names match, preserve + s = self.ts.copy() + result = getattr(s, op)(s) + self.assertEqual(result.name, self.ts.name) + + # names don't match, don't preserve + cp = self.ts.copy() + cp.name = 'changed' + result = getattr(s, op)(cp) + self.assertIsNone(result.name) def test_combine_first_name(self): result = self.ts.combine_first(self.ts[:5]) @@ -1859,6 +1897,48 @@ def test_where_dups(self): expected = Series([5,11,2,5,11,2],index=[0,1,2,0,1,2]) assert_series_equal(comb, expected) + def test_where_datetime(self): + s = Series(date_range('20130102', periods=2)) + expected = Series([10, 10], dtype='datetime64[ns]') + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype='datetime64[ns]') + assert_series_equal(rs, expected) + + def test_where_timedelta(self): + s = Series([1, 2], dtype='timedelta64[ns]') + expected = Series([10, 10], dtype='timedelta64[ns]') + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype='timedelta64[ns]') + assert_series_equal(rs, expected) + def test_mask(self): # compare with tested results in test_where s = Series(np.random.randn(5)) @@ -1954,6 +2034,14 @@ def test_drop(self): self.assertRaises(ValueError, s.drop, 'bc') self.assertRaises(ValueError, s.drop, ('a',)) + # errors='ignore' + s = Series(range(3),index=list('abc')) + result = s.drop('bc', errors='ignore') + assert_series_equal(result, s) + result = s.drop(['a', 'd'], errors='ignore') + expected = s.ix[1:] + assert_series_equal(result, expected) + # bad axis self.assertRaises(ValueError, s.drop, 'one', axis='columns') @@ -3583,6 +3671,16 @@ def test_fillna(self): expected = Series([999,999,np.nan],index=[0,1,2]) assert_series_equal(result,expected) + # GH 9043 + # make sure a string representation of int/float values can be filled + # correctly without raising errors or being converted + vals = ['0', '1.5', '-0.3'] + for val in vals: + s = Series([0, 1, np.nan, np.nan, 4], dtype='float64') + result = s.fillna(val) + expected = Series([0, 1, val, val, 4], dtype='object') + assert_series_equal(result, expected) + def test_fillna_bug(self): x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) filled = x.fillna(method='ffill') @@ -4925,6 +5023,19 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) self.assertIsInstance(csv_str, str) + def test_str_attribute(self): + # GH9068 + methods = ['strip', 'rstrip', 'lstrip'] + s = Series([' jack', 'jill ', ' jesse ', 'frank']) + for method in methods: + 
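The test_drop additions above cover errors='ignore', which silently skips labels that are not present instead of raising. A minimal sketch mirroring the test's data:

    import pandas as pd

    s = pd.Series(range(3), index=list('abc'))

    # 'a' exists and is dropped, the unknown label 'd' is ignored
    print(s.drop(['a', 'd'], errors='ignore'))

    # the default (errors='raise') still raises ValueError for missing labels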
expected = Series([getattr(str, method)(x) for x in s.values]) + assert_series_equal(getattr(Series.str, method)(s.str), expected) + + # str accessor only valid with string values + s = Series(range(5)) + with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'): + s.str.repeat(2) + def test_clip(self): val = self.ts.median() @@ -4954,6 +5065,20 @@ def test_clip_types_and_nulls(self): self.assertEqual(list(isnull(s)), list(isnull(l))) self.assertEqual(list(isnull(s)), list(isnull(u))) + def test_clip_against_series(self): + # GH #6966 + + s = Series([1.0, 1.0, 4.0]) + threshold = Series([1.0, 2.0, 3.0]) + + assert_series_equal(s.clip_lower(threshold), Series([1.0, 2.0, 4.0])) + assert_series_equal(s.clip_upper(threshold), Series([1.0, 1.0, 3.0])) + + lower = Series([1.0, 2.0, 3.0]) + upper = Series([1.5, 2.5, 3.5]) + assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) + assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) + def test_valid(self): ts = self.ts.copy() ts[::2] = np.NaN @@ -5290,7 +5415,8 @@ def test_getitem_setitem_datetime_tz_pytz(self): def test_getitem_setitem_datetime_tz_dateutil(self): tm._skip_if_no_dateutil(); from dateutil.tz import tzutc - from dateutil.zoneinfo import gettz + from pandas.tslib import _dateutil_gettz as gettz + tz = lambda x: tzutc() if x == 'UTC' else gettz(x) # handle special case for utc in dateutil from pandas import date_range @@ -5503,6 +5629,24 @@ def test_astype_str(self): expec = s.map(compat.text_type) assert_series_equal(res, expec) + # GH9757 + # Test str and unicode on python 2.x and just str on python 3.x + for tt in set([str, compat.text_type]): + ts = Series([Timestamp('2010-01-04 00:00:00')]) + s = ts.astype(tt) + expected = Series([tt(ts.values[0])]) + assert_series_equal(s, expected) + + ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')]) + s = ts.astype(tt) + expected = Series([tt(ts.values[0])]) + assert_series_equal(s, expected) + + td = Series([Timedelta(1, unit='d')]) + s = td.astype(tt) + expected = Series([tt(td.values[0])]) + assert_series_equal(s, expected) + def test_astype_unicode(self): # GH7758 @@ -5578,6 +5722,22 @@ def test_map_type_inference(self): s2 = s.map(lambda x: np.where(x == 0, 0, 1)) self.assertTrue(issubclass(s2.dtype.type, np.integer)) + def test_divide_decimal(self): + ''' resolves issue #9787 ''' + from decimal import Decimal + + expected = Series([Decimal(5)]) + + s = Series([Decimal(10)]) + s = s/Decimal(2) + + tm.assert_series_equal(expected, s) + + s = Series([Decimal(10)]) + s = s//Decimal(2) + + tm.assert_series_equal(expected, s) + def test_map_decimal(self): from decimal import Decimal @@ -5789,6 +5949,10 @@ def _check_align(a, b, how='left', fill=None): assert_series_equal(aa, ea) assert_series_equal(ab, eb) + self.assertEqual(aa.name, 'ts') + self.assertEqual(ea.name, 'ts') + self.assertEqual(ab.name, 'ts') + self.assertEqual(eb.name, 'ts') for kind in JOIN_TYPES: _check_align(self.ts[2:], self.ts[:-5], how=kind) @@ -5796,12 +5960,15 @@ def _check_align(a, b, how='left', fill=None): # empty left _check_align(self.ts[:0], self.ts[:-5], how=kind) + _check_align(self.ts[:0], self.ts[:-5], how=kind, fill=-1) # empty right _check_align(self.ts[:-5], self.ts[:0], how=kind) + _check_align(self.ts[:-5], self.ts[:0], how=kind, fill=-1) # both empty _check_align(self.ts[:0], self.ts[:0], how=kind) + _check_align(self.ts[:0], self.ts[:0], how=kind, fill=-1) def test_align_fill_method(self): def _check_align(a, b, how='left', method='pad', limit=None): 
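test_clip_against_series (GH 6966) above shows clip, clip_lower and clip_upper accepting Series thresholds and clipping elementwise. A sketch reusing the test's values:

    import pandas as pd

    s = pd.Series([1.0, 1.0, 4.0])
    lower = pd.Series([1.0, 2.0, 3.0])
    upper = pd.Series([1.5, 2.5, 3.5])

    print(s.clip_lower(lower))    # 1.0, 2.0, 4.0
    print(s.clip(lower, upper))   # 1.0, 2.0, 3.5
    print(s.clip(1.5, upper))     # scalars and Series mix: 1.5, 1.5, 3.5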
@@ -6754,6 +6921,22 @@ def test_searchsorted_sorter(self): e = np.array([0, 2]) tm.assert_array_equal(r, e) + def test_to_frame_expanddim(self): + # GH 9762 + + class SubclassedSeries(Series): + @property + def _constructor_expanddim(self): + return SubclassedFrame + + class SubclassedFrame(DataFrame): + pass + + s = SubclassedSeries([1, 2, 3], name='X') + result = s.to_frame() + self.assertTrue(isinstance(result, SubclassedFrame)) + expected = SubclassedFrame({'X': [1, 2, 3]}) + assert_frame_equal(result, expected) class TestSeriesNonUnique(tm.TestCase): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 9283be566bd8f..b0d8d89d65cf2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -516,7 +516,6 @@ def test_match(self): def test_extract(self): # Contains tests like those in test_match and some others. - values = Series(['fooBAD__barBAD', NA, 'foo']) er = [NA, NA] # empty row @@ -540,15 +539,31 @@ def test_extract(self): exp = DataFrame([[u('BAD__'), u('BAD')], er, er]) tm.assert_frame_equal(result, exp) - # no groups - s = Series(['A1', 'B2', 'C3']) - f = lambda: s.str.extract('[ABC][123]') - self.assertRaises(ValueError, f) - - # only non-capturing groups - f = lambda: s.str.extract('(?:[AB]).*') - self.assertRaises(ValueError, f) + # GH9980 + # Index only works with one regex group since + # multi-group would expand to a frame + idx = Index(['A1', 'A2', 'A3', 'A4', 'B5']) + with tm.assertRaisesRegexp(ValueError, "supported"): + idx.str.extract('([AB])([123])') + + # these should work for both Series and Index + for klass in [Series, Index]: + # no groups + s_or_idx = klass(['A1', 'B2', 'C3']) + f = lambda: s_or_idx.str.extract('[ABC][123]') + self.assertRaises(ValueError, f) + + # only non-capturing groups + f = lambda: s_or_idx.str.extract('(?:[AB]).*') + self.assertRaises(ValueError, f) + + # single group renames series/index properly + s_or_idx = klass(['A1', 'A2']) + result = s_or_idx.str.extract(r'(?PA)\d') + tm.assert_equal(result.name, 'uno') + tm.assert_array_equal(result, klass(['A', 'A'])) + s = Series(['A1', 'B2', 'C3']) # one group, no matches result = s.str.extract('(_)') exp = Series([NA, NA, NA], dtype=object) @@ -569,14 +584,16 @@ def test_extract(self): exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) tm.assert_frame_equal(result, exp) - # named group/groups - result = s.str.extract('(?P[AB])(?P[123])') - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number']) - tm.assert_frame_equal(result, exp) + # one named group result = s.str.extract('(?P[AB])') exp = Series(['A', 'B', NA], name='letter') tm.assert_series_equal(result, exp) + # two named groups + result = s.str.extract('(?P[AB])(?P[123])') + exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number']) + tm.assert_frame_equal(result, exp) + # mix named and unnamed groups result = s.str.extract('([AB])(?P[123])') exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number']) @@ -602,11 +619,6 @@ def test_extract(self): exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) - # single group renames series properly - s = Series(['A1', 'A2']) - result = s.str.extract(r'(?PA)\d') - tm.assert_equal(result.name, 'uno') - # GH6348 # not passing index to the extractor def check_index(index): @@ -664,6 +676,8 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, 
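test_to_frame_expanddim (GH 9762) above exercises the _constructor_expanddim hook that lets a Series subclass control what to_frame() returns. A sketch lifted almost verbatim from the test:

    import pandas as pd

    class SubclassedFrame(pd.DataFrame):
        pass

    class SubclassedSeries(pd.Series):
        @property
        def _constructor_expanddim(self):
            # dimension-expanding operations build this type instead of DataFrame
            return SubclassedFrame

    s = SubclassedSeries([1, 2, 3], name='X')
    assert isinstance(s.to_frame(), SubclassedFrame)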
empty.str.center(42)) tm.assert_series_equal(empty_list, empty.str.split('a')) + tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False)) + tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) @@ -685,6 +699,19 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) + tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) + if compat.PY3: + table = str.maketrans('a', 'b') + else: + import string + table = string.maketrans('a', 'b') + tm.assert_series_equal(empty_str, empty.str.translate(table)) + + def test_empty_str_methods_to_frame(self): + empty_str = empty = Series(dtype=str) + empty_df = DataFrame([]) + tm.assert_frame_equal(empty_df, empty.str.partition('a')) + tm.assert_frame_equal(empty_df, empty.str.rpartition('a')) def test_ismethods(self): values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] @@ -752,6 +779,12 @@ def test_get_dummies(self): columns=list('7ab')) tm.assert_frame_equal(result, expected) + # GH9980 + # Index.str does not support get_dummies() as it returns a frame + with tm.assertRaisesRegexp(TypeError, "not supported"): + idx = Index(['a|b', 'a|c', 'b|c']) + idx.str.get_dummies('|') + def test_join(self): values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) result = values.str.split('_').str.join('_') @@ -881,6 +914,53 @@ def test_find_nan(self): result = values.str.rfind('EF', 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) + def test_index(self): + for klass in [Series, Index]: + s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF']) + + result = s.str.index('EF') + tm.assert_array_equal(result, klass([4, 3, 1, 0])) + expected = np.array([v.index('EF') for v in s.values]) + tm.assert_array_equal(result.values, expected) + + result = s.str.rindex('EF') + tm.assert_array_equal(result, klass([4, 5, 7, 4])) + expected = np.array([v.rindex('EF') for v in s.values]) + tm.assert_array_equal(result.values, expected) + + result = s.str.index('EF', 3) + tm.assert_array_equal(result, klass([4, 3, 7, 4])) + expected = np.array([v.index('EF', 3) for v in s.values]) + tm.assert_array_equal(result.values, expected) + + result = s.str.rindex('EF', 3) + tm.assert_array_equal(result, klass([4, 5, 7, 4])) + expected = np.array([v.rindex('EF', 3) for v in s.values]) + tm.assert_array_equal(result.values, expected) + + result = s.str.index('E', 4, 8) + tm.assert_array_equal(result, klass([4, 5, 7, 4])) + expected = np.array([v.index('E', 4, 8) for v in s.values]) + tm.assert_array_equal(result.values, expected) + + result = s.str.rindex('E', 0, 5) + tm.assert_array_equal(result, klass([4, 3, 1, 4])) + expected = np.array([v.rindex('E', 0, 5) for v in s.values]) + tm.assert_array_equal(result.values, expected) + + with tm.assertRaisesRegexp(ValueError, "substring not found"): + result = s.str.index('DE') + + with tm.assertRaisesRegexp(TypeError, "expected a string object, not int"): + result = s.str.index(0) + + # test with nan + s = Series(['abcb', 'ab', 'bcbe', np.nan]) + result = s.str.index('b') + tm.assert_array_equal(result, Series([1, 1, 0, np.nan])) + result = s.str.rindex('b') + tm.assert_array_equal(result, Series([3, 1, 2, np.nan])) + def test_pad(self): values = 
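test_index above drives the new str.index/str.rindex methods, which behave like str.index (raising when the substring is missing) but propagate NaN and forward optional start/end positions. A sketch with the test's data:

    import numpy as np
    import pandas as pd

    s = pd.Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF'])

    print(s.str.index('EF'))      # 4, 3, 1, 0
    print(s.str.rindex('EF'))     # 4, 5, 7, 4
    print(s.str.index('EF', 3))   # start (and end) positions are forwarded

    # NaN entries propagate instead of raising
    print(pd.Series(['abcb', 'ab', np.nan]).str.index('b'))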
Series(['a', 'b', NA, 'c', NA, 'eeeeee']) @@ -965,6 +1045,37 @@ def test_pad_fillchar(self): with tm.assertRaisesRegexp(TypeError, "fillchar must be a character, not int"): result = values.str.pad(5, fillchar=5) + def test_translate(self): + for klass in [Series, Index]: + s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg']) + if not compat.PY3: + import string + table = string.maketrans('abc', 'cde') + else: + table = str.maketrans('abc', 'cde') + result = s.str.translate(table) + expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg']) + tm.assert_array_equal(result, expected) + + # use of deletechars is python 2 only + if not compat.PY3: + result = s.str.translate(table, deletechars='fg') + expected = klass(['cdede', 'cdee', 'eddd', 'ede']) + tm.assert_array_equal(result, expected) + + result = s.str.translate(None, deletechars='fg') + expected = klass(['abcde', 'abcc', 'cddd', 'cde']) + tm.assert_array_equal(result, expected) + else: + with tm.assertRaisesRegexp(ValueError, "deletechars is not a valid argument"): + result = s.str.translate(table, deletechars='fg') + + # Series with non-string values + s = Series(['a', 'b', 'c', 1.2]) + expected = Series(['c', 'd', 'e', np.nan]) + result = s.str.translate(table) + tm.assert_array_equal(result, expected) + def test_center_ljust_rjust(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) @@ -1095,14 +1206,19 @@ def test_split(self): result = values.str.split('__') tm.assert_series_equal(result, exp) + result = values.str.split('__', expand=False) + tm.assert_series_equal(result, exp) + # mixed mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, 2.]) - - rs = Series(mixed).str.split('_') + rs = mixed.str.split('_') xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA]) + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + rs = mixed.str.split('_', expand=False) tm.assert_isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -1115,6 +1231,9 @@ def test_split(self): [u('f'), u('g'), u('h')]]) tm.assert_series_equal(result, exp) + result = values.str.split('_', expand=False) + tm.assert_series_equal(result, exp) + def test_split_noargs(self): # #1859 s = Series(['Wes McKinney', 'Travis Oliphant']) @@ -1148,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self): def test_split_to_dataframe(self): s = Series(['nosplit', 'alsonosplit']) - result = s.str.split('_', return_type='frame') + + with tm.assert_produces_warning(): + result = s.str.split('_', return_type='frame') + exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) tm.assert_frame_equal(result, exp) @@ -1171,9 +1293,174 @@ def test_split_to_dataframe(self): index=['preserve', 'me']) tm.assert_frame_equal(result, exp) - with tm.assertRaisesRegexp(ValueError, "return_type must be"): + with tm.assertRaisesRegexp(ValueError, "expand must be"): s.str.split('_', return_type="some_invalid_type") + def test_split_to_dataframe_expand(self): + s = Series(['nosplit', 'alsonosplit']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_equal_splits', 'with_no_nans']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], + 2: ['splits', 'nans']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'], + 2: ['splits', 'these'], 3: 
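test_translate above feeds str.translate a table built by the stdlib maketrans helper (str.maketrans on Python 3, string.maketrans on Python 2; deletechars is Python 2 only). A Python 3 sketch of the behaviour the test encodes:

    import pandas as pd

    s = pd.Series(['abcdefg', 'abcc', 'cdddfg'])
    table = str.maketrans('abc', 'cde')

    print(s.str.translate(table))    # 'cdedefg', 'cdee', 'edddfg'

    # non-string entries come back as NaN
    print(pd.Series(['a', 'b', 1.2]).str.translate(table))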
[NA, 'things'], + 4: [NA, 'is'], 5: [NA, 'not']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, + index=['preserve', 'me']) + tm.assert_frame_equal(result, exp) + + with tm.assertRaisesRegexp(ValueError, "expand must be"): + s.str.split('_', return_type="some_invalid_type") + + def test_split_to_multiindex_expand(self): + idx = Index(['nosplit', 'alsonosplit']) + result = idx.str.split('_', expand=True) + exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + idx = Index(['some_equal_splits', 'with_no_nans']) + result = idx.str.split('_', expand=True) + exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), + ('with', 'no', 'nans')]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 3) + + idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) + result = idx.str.split('_', expand=True) + exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA), + ('one', 'of', 'these', 'things', 'is', 'not')]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 6) + + with tm.assertRaisesRegexp(ValueError, "expand must be"): + idx.str.split('_', return_type="some_invalid_type") + + def test_partition_series(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + + result = values.str.partition('_', expand=False) + exp = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('_', expand=False) + exp = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']]) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) + result = values.str.partition('__', expand=False) + exp = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA, ['f', '__', 'g__h']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('__', expand=False) + exp = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA, ['f__g', '__', 'h']]) + tm.assert_series_equal(result, exp) + + # None + values = Series(['a b c', 'c d e', NA, 'f g h']) + result = values.str.partition(expand=False) + exp = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition(expand=False) + exp = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']]) + tm.assert_series_equal(result, exp) + + # Not splited + values = Series(['abc', 'cde', NA, 'fgh']) + result = values.str.partition('_', expand=False) + exp = Series([['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('_', expand=False) + exp = Series([['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']]) + tm.assert_series_equal(result, exp) + + # unicode + values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) + + result = values.str.partition('_', expand=False) + exp = Series([[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')], + NA, [u('f'), u('_'), u('g_h')]]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('_', expand=False) + exp = Series([[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')], + NA, [u('f_g'), u('_'), u('h')]]) + tm.assert_series_equal(result, exp) + + # compare to standard lib + values 
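The split tests above move from return_type (now only warned about) to an expand keyword: expand=False keeps the list-of-pieces Series, expand=True returns a NaN-padded DataFrame, and on an Index it builds a MultiIndex. A short sketch:

    import pandas as pd

    s = pd.Series(['some_unequal_splits', 'one_of_these_things_is_not'])
    print(s.str.split('_', expand=True))    # one column per piece, NaN padded
    print(s.str.split('_', expand=False))   # the old list-in-a-Series behaviour

    idx = pd.Index(['some_equal_splits', 'with_no_nans'])
    print(idx.str.split('_', expand=True).nlevels)   # 3 (a MultiIndex)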
= Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF']) + result = values.str.partition('_', expand=False).tolist() + self.assertEqual(result, [v.partition('_') for v in values]) + result = values.str.rpartition('_', expand=False).tolist() + self.assertEqual(result, [v.rpartition('_') for v in values]) + + def test_partition_index(self): + values = Index(['a_b_c', 'c_d_e', 'f_g_h']) + + result = values.str.partition('_', expand=False) + exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + result = values.str.rpartition('_', expand=False) + exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + result = values.str.partition('_') + exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]) + tm.assert_index_equal(result, exp) + self.assertTrue(isinstance(result, MultiIndex)) + self.assertEqual(result.nlevels, 3) + + result = values.str.rpartition('_') + exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]) + tm.assert_index_equal(result, exp) + self.assertTrue(isinstance(result, MultiIndex)) + self.assertEqual(result.nlevels, 3) + + def test_partition_to_dataframe(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + result = values.str.partition('_') + exp = DataFrame({0: ['a', 'c', np.nan, 'f'], + 1: ['_', '_', np.nan, '_'], + 2: ['b_c', 'd_e', np.nan, 'g_h']}) + tm.assert_frame_equal(result, exp) + + result = values.str.rpartition('_') + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], + 1: ['_', '_', np.nan, '_'], + 2: ['c', 'e', np.nan, 'h']}) + tm.assert_frame_equal(result, exp) + + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + result = values.str.partition('_', expand=True) + exp = DataFrame({0: ['a', 'c', np.nan, 'f'], + 1: ['_', '_', np.nan, '_'], + 2: ['b_c', 'd_e', np.nan, 'g_h']}) + tm.assert_frame_equal(result, exp) + + result = values.str.rpartition('_', expand=True) + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], + 1: ['_', '_', np.nan, '_'], + 2: ['c', 'e', np.nan, 'h']}) + tm.assert_frame_equal(result, exp) + def test_pipe_failures(self): # #2119 s = Series(['A|B|C']) @@ -1549,6 +1836,51 @@ def test_encode_decode_errors(self): tm.assert_series_equal(result, exp) + def test_normalize(self): + def unistr(codes): + # build unicode string from unichr + # we cannot use six.u() here because it escapes unicode + return ''.join([unichr(c) for c in codes]) + + values = ['ABC', # ASCII + unistr([0xFF21, 0xFF22, 0xFF23]), # ABC + unistr([0xFF11, 0xFF12, 0xFF13]), # 123 + np.nan, + unistr([0xFF71, 0xFF72, 0xFF74])] # アイエ + s = Series(values, index=['a', 'b', 'c', 'd', 'e']) + + normed = [compat.u_safe('ABC'), + compat.u_safe('ABC'), + compat.u_safe('123'), + np.nan, + unistr([0x30A2, 0x30A4, 0x30A8])] # アイエ + expected = Series(normed, index=['a', 'b', 'c', 'd', 'e']) + + result = s.str.normalize('NFKC') + tm.assert_series_equal(result, expected) + + expected = Series([compat.u_safe('ABC'), + unistr([0xFF21, 0xFF22, 0xFF23]), # ABC + unistr([0xFF11, 0xFF12, 0xFF13]), # 123 + np.nan, + unistr([0xFF71, 0xFF72, 0xFF74])], # アイエ + index=['a', 'b', 'c', 'd', 'e']) + + result = s.str.normalize('NFC') + tm.assert_series_equal(result, expected) + + with tm.assertRaisesRegexp(ValueError, "invalid normalization form"): + s.str.normalize('xxx') + + s = Index([unistr([0xFF21, 0xFF22, 0xFF23]), # ABC + unistr([0xFF11, 0xFF12, 0xFF13]), # 123 + 
unistr([0xFF71, 0xFF72, 0xFF74])]) # アイエ + expected = Index([compat.u_safe('ABC'), + compat.u_safe('123'), + unistr([0x30A2, 0x30A4, 0x30A8])]) + result = s.str.normalize('NFKC') + tm.assert_index_equal(result, expected) + def test_cat_on_filtered_index(self): df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]], names=['year', 'month'])) @@ -1567,6 +1899,68 @@ def test_cat_on_filtered_index(self): self.assertEqual(str_multiple.loc[1], '2011 2 2') + def test_index_str_accessor_visibility(self): + from pandas.core.strings import StringMethods + + if not compat.PY3: + cases = [(['a', 'b'], 'string'), + (['a', u('b')], 'mixed'), + ([u('a'), u('b')], 'unicode'), + (['a', 'b', 1], 'mixed-integer'), + (['a', 'b', 1.3], 'mixed'), + (['a', 'b', 1.3, 1], 'mixed-integer'), + (['aa', datetime(2011, 1, 1)], 'mixed')] + else: + cases = [(['a', 'b'], 'string'), + (['a', u('b')], 'string'), + ([u('a'), u('b')], 'string'), + (['a', 'b', 1], 'mixed-integer'), + (['a', 'b', 1.3], 'mixed'), + (['a', 'b', 1.3, 1], 'mixed-integer'), + (['aa', datetime(2011, 1, 1)], 'mixed')] + for values, tp in cases: + idx = Index(values) + self.assertTrue(isinstance(Series(values).str, StringMethods)) + self.assertTrue(isinstance(idx.str, StringMethods)) + self.assertEqual(idx.inferred_type, tp) + + for values, tp in cases: + idx = Index(values) + self.assertTrue(isinstance(Series(values).str, StringMethods)) + self.assertTrue(isinstance(idx.str, StringMethods)) + self.assertEqual(idx.inferred_type, tp) + + cases = [([1, np.nan], 'floating'), + ([datetime(2011, 1, 1)], 'datetime64'), + ([timedelta(1)], 'timedelta64')] + for values, tp in cases: + idx = Index(values) + message = 'Can only use .str accessor with string values' + with self.assertRaisesRegexp(AttributeError, message): + Series(values).str + with self.assertRaisesRegexp(AttributeError, message): + idx.str + self.assertEqual(idx.inferred_type, tp) + + # MultiIndex has mixed dtype, but not allow to use accessor + idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')]) + self.assertEqual(idx.inferred_type, 'mixed') + message = 'Can only use .str accessor with Index, not MultiIndex' + with self.assertRaisesRegexp(AttributeError, message): + idx.str + + def test_method_on_bytes(self): + lhs = Series(np.array(list('abc'), 'S1').astype(object)) + rhs = Series(np.array(list('def'), 'S1').astype(object)) + if compat.PY3: + self.assertRaises(TypeError, lhs.str.cat, rhs) + else: + result = lhs.str.cat(rhs) + expected = Series(np.array(['ad', 'be', 'cf'], + 'S2').astype(object)) + tm.assert_series_equal(result, expected) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 642e50c37874d..cc0a0ea5662db 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -215,6 +215,14 @@ def test_multiindex_dtype(self): {'a':[1.0,2.0],'b':[2.1,1.5],'c':['l1','l2']}, index=['a','b']) self._assert_not_equal(df1, df2, check_index_type=True) + def test_empty_dtypes(self): + df1=pd.DataFrame(columns=["col1","col2"]) + df1["col1"] = df1["col1"].astype('int64') + df2=pd.DataFrame(columns=["col1","col2"]) + self._assert_equal(df1, df2, check_dtype=False) + self._assert_not_equal(df1, df2, check_dtype=True) + + class TestRNGContext(unittest.TestCase): def test_RNGContext(self): diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 1b796ed2d83d1..035b3ac07342d 100644 --- 
a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -9,6 +9,8 @@ import pandas.lib as lib import pandas._period as period import pandas.algos as algos +from pandas.tseries.holiday import Holiday, SA, next_monday +from pandas import DateOffset class TestTseriesUtil(tm.TestCase): @@ -737,6 +739,17 @@ def test_get_period_field_raises_on_out_of_range(self): def test_get_period_field_array_raises_on_out_of_range(self): self.assertRaises(ValueError, period.get_period_field_arr, -1, np.empty(1), 0) + +class TestHolidayConflictingArguments(tm.TestCase): + + # GH 10217 + + def test_both_offset_observance_raises(self): + + with self.assertRaises(NotImplementedError) as cm: + h = Holiday("Cyber Monday", month=11, day=1, + offset=[DateOffset(weekday=SA(4))], observance=next_monday) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index 2e22b33dc769a..38f058358b37f 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -3,6 +3,7 @@ import nose +import sys import pandas.util from pandas.util.decorators import deprecate_kwarg import pandas.util.testing as tm @@ -79,6 +80,13 @@ def test_warning(self): with tm.assert_produces_warning(FutureWarning): self.assertNotAlmostEquals(1, 2) + def test_locale(self): + if sys.platform == 'win32': + raise nose.SkipTest("skipping on win platforms as locale not available") + + #GH9744 + locales = pandas.util.testing.get_locales() + self.assertTrue(len(locales) >= 1) def test_rands(): r = tm.rands(10) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 0be030d7c2c8e..76685e2589012 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -12,6 +12,7 @@ from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.core.common as com +from pandas.core.common import AbstractMethodError from pandas.core.generic import _shared_docs, _shared_doc_kwargs from pandas.core.index import Index, MultiIndex from pandas.core.series import Series, remove_na @@ -131,7 +132,7 @@ def random_color(column): colors = lmap(random_color, lrange(num_colors)) else: - raise NotImplementedError + raise ValueError("color_type must be either 'default' or 'random'") if len(colors) != num_colors: multiple = num_colors//len(colors) - 1 @@ -809,7 +810,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else True + grid = False if secondary_y else self.plt.rcParams['axes.grid'] self.grid = grid self.legend = legend @@ -867,12 +868,17 @@ def _validate_color_args(self): "simultaneously. Using 'color'") if 'color' in self.kwds and self.style is not None: + if com.is_list_like(self.style): + styles = self.style + else: + styles = [self.style] # need only a single match - if re.match('^[a-z]+?', self.style) is not None: - raise ValueError("Cannot pass 'style' string with a color " - "symbol and 'color' keyword argument. Please" - " use one or the other or pass 'style' " - "without a color symbol") + for s in styles: + if re.match('^[a-z]+?', s) is not None: + raise ValueError("Cannot pass 'style' string with a color " + "symbol and 'color' keyword argument. 
Please" + " use one or the other or pass 'style' " + "without a color symbol") def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: @@ -880,28 +886,16 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - from pandas.core.frame import DataFrame - if isinstance(data, (Series, np.ndarray, Index)): - label = self.label if self.label is not None else data.name + if self.sort_columns: + columns = com._try_sort(data.columns) + else: + columns = data.columns + + for col in columns: if keep_index is True: - yield label, data + yield col, data[col] else: - yield label, np.asarray(data) - elif isinstance(data, DataFrame): - if self.sort_columns: - columns = com._try_sort(data.columns) - else: - columns = data.columns - - for col in columns: - # # is this right? - # empty = df[col].count() == 0 - # values = df[col].values if not empty else np.zeros(len(df)) - - if keep_index is True: - yield col, data[col] - else: - yield col, data[col].values + yield col, data[col].values @property def nseries(self): @@ -934,19 +928,21 @@ def _has_plotted_object(self, ax): def _maybe_right_yaxis(self, ax, axes_num): if not self.on_right(axes_num): - if hasattr(ax, 'left_ax'): - # secondary axes may be passed as axes - return ax.left_ax - return ax + # secondary axes may be passed via ax kw + return self._get_ax_layer(ax) if hasattr(ax, 'right_ax'): + # if it has right_ax proparty, ``ax`` must be left axes return ax.right_ax + elif hasattr(ax, 'left_ax'): + # if it has left_ax proparty, ``ax`` must be right axes + return ax else: + # otherwise, create twin axes orig_ax, new_ax = ax, ax.twinx() new_ax._get_lines.color_cycle = orig_ax._get_lines.color_cycle orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax - new_ax.right_ax = new_ax if not self._has_plotted_object(orig_ax): # no data on left y orig_ax.get_yaxis().set_visible(False) @@ -994,14 +990,21 @@ def result(self): all_sec = (com.is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries) if (sec_true or all_sec): - # if all data is plotted on secondary, - # return secondary axes - return self.axes[0].right_ax + # if all data is plotted on secondary, return right axes + return self._get_ax_layer(self.axes[0], primary=False) else: return self.axes[0] def _compute_plot_data(self): - numeric_data = self.data.convert_objects()._get_numeric_data() + data = self.data + + if isinstance(data, Series): + label = self.label + if label is None and data.name is None: + label = 'None' + data = data.to_frame(name=label) + + numeric_data = data.convert_objects()._get_numeric_data() try: is_empty = numeric_data.empty @@ -1016,18 +1019,13 @@ def _compute_plot_data(self): self.data = numeric_data def _make_plot(self): - raise NotImplementedError + raise AbstractMethodError(self) def _add_table(self): if self.table is False: return elif self.table is True: - from pandas.core.frame import DataFrame - if isinstance(self.data, Series): - data = DataFrame(self.data, columns=[self.data.name]) - elif isinstance(self.data, DataFrame): - data = self.data - data = data.transpose() + data = self.data.transpose() else: data = self.table ax = self._get_ax(0) @@ -1042,7 +1040,10 @@ def _adorn_subplots(self): if len(self.axes) > 0: all_axes = self._get_axes() nrows, ncols = self._get_axes_layout() - _handle_shared_axes(all_axes, len(all_axes), len(all_axes), nrows, ncols, self.sharex, self.sharey) + _handle_shared_axes(axarr=all_axes, nplots=len(all_axes), + naxes=nrows * ncols, 
nrows=nrows, + ncols=ncols, sharex=self.sharex, + sharey=self.sharey) for ax in to_adorn: if self.yticks is not None: @@ -1094,18 +1095,15 @@ def _apply_axis_properties(self, axis, rot=None, fontsize=None): @property def legend_title(self): - if hasattr(self.data, 'columns'): - if not isinstance(self.data.columns, MultiIndex): - name = self.data.columns.name - if name is not None: - name = com.pprint_thing(name) - return name - else: - stringified = map(com.pprint_thing, - self.data.columns.names) - return ','.join(stringified) + if not isinstance(self.data.columns, MultiIndex): + name = self.data.columns.name + if name is not None: + name = com.pprint_thing(name) + return name else: - return None + stringified = map(com.pprint_thing, + self.data.columns.names) + return ','.join(stringified) def _add_legend_handle(self, handle, label, index=None): if not label is None: @@ -1236,11 +1234,18 @@ def _get_index_name(self): return name + @classmethod + def _get_ax_layer(cls, ax, primary=True): + """get left (primary) or right (secondary) axes""" + if primary: + return getattr(ax, 'left_ax', ax) + else: + return getattr(ax, 'right_ax', ax) + def _get_ax(self, i): # get the twinx ax if appropriate if self.subplots: ax = self.axes[i] - ax = self._maybe_right_yaxis(ax, i) self.axes[i] = ax else: @@ -1251,12 +1256,10 @@ def _get_ax(self, i): return ax def on_right(self, i): - from pandas.core.frame import DataFrame if isinstance(self.secondary_y, bool): return self.secondary_y - if (isinstance(self.data, DataFrame) and - isinstance(self.secondary_y, (tuple, list, np.ndarray, Index))): + if isinstance(self.secondary_y, (tuple, list, np.ndarray, Index)): return self.data.columns[i] in self.secondary_y def _get_style(self, i, col_name): @@ -1548,16 +1551,14 @@ def __init__(self, data, **kwargs): self.x_compat = bool(self.kwds.pop('x_compat')) def _index_freq(self): - from pandas.core.frame import DataFrame - if isinstance(self.data, (Series, DataFrame)): - freq = getattr(self.data.index, 'freq', None) - if freq is None: - freq = getattr(self.data.index, 'inferred_freq', None) - if freq == 'B': - weekdays = np.unique(self.data.index.dayofweek) - if (5 in weekdays) or (6 in weekdays): - freq = None - return freq + freq = getattr(self.data.index, 'freq', None) + if freq is None: + freq = getattr(self.data.index, 'inferred_freq', None) + if freq == 'B': + weekdays = np.unique(self.data.index.dayofweek) + if (5 in weekdays) or (6 in weekdays): + freq = None + return freq def _is_dynamic_freq(self, freq): if isinstance(freq, DateOffset): @@ -1569,9 +1570,7 @@ def _is_dynamic_freq(self, freq): def _no_base(self, freq): # hack this for 0.10.1, creating more technical debt...sigh - from pandas.core.frame import DataFrame - if (isinstance(self.data, (Series, DataFrame)) - and isinstance(self.data.index, DatetimeIndex)): + if isinstance(self.data.index, DatetimeIndex): base = frequencies.get_freq(freq) x = self.data.index if (base <= frequencies.FreqGroup.FR_DAY): @@ -1681,17 +1680,13 @@ def _update_prior(self, y): def _maybe_convert_index(self, data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames - from pandas.core.frame import DataFrame - if (isinstance(data.index, DatetimeIndex) and - isinstance(data, DataFrame)): + if isinstance(data.index, DatetimeIndex): freq = getattr(data.index, 'freq', None) if freq is None: freq = getattr(data.index, 'inferred_freq', None) if isinstance(freq, DateOffset): freq = freq.rule_code - freq = frequencies.get_base_alias(freq) - 
freq = frequencies.get_period_alias(freq) if freq is None: ax = self._get_ax(0) @@ -1700,9 +1695,10 @@ def _maybe_convert_index(self, data): if freq is None: raise ValueError('Could not get frequency alias for plotting') - data = DataFrame(data.values, - index=data.index.to_period(freq=freq), - columns=data.columns) + freq = frequencies.get_base_alias(freq) + freq = frequencies.get_period_alias(freq) + + data.index = data.index.to_period(freq=freq) return data def _post_plot_logic(self): @@ -1831,21 +1827,19 @@ def _get_plot_function(self): if self.kind == 'bar': def f(ax, x, y, w, start=None, **kwds): start = start + self.bottom - return ax.bar(x, y, w, bottom=start,log=self.log, **kwds) + return ax.bar(x, y, w, bottom=start, log=self.log, **kwds) elif self.kind == 'barh': + def f(ax, x, y, w, start=None, log=self.log, **kwds): start = start + self.left - return ax.barh(x, y, w, left=start, **kwds) + return ax.barh(x, y, w, left=start, log=self.log, **kwds) else: - raise NotImplementedError + raise ValueError("BarPlot kind must be either 'bar' or 'barh'") return f def _make_plot(self): import matplotlib as mpl - # mpl decided to make their version string unicode across all Python - # versions for mpl >= 1.3 so we have to call str here for python 2 - mpl_le_1_2_1 = str(mpl.__version__) <= LooseVersion('1.2.1') colors = self._get_colors() ncolors = len(colors) @@ -1869,11 +1863,8 @@ def _make_plot(self): kwds['ecolor'] = mpl.rcParams['xtick.color'] start = 0 - if self.log: + if self.log and (y >= 1).all(): start = 1 - if any(y < 1): - # GH3254 - start = 0 if mpl_le_1_2_1 else None if self.subplots: w = self.bar_width / 2 @@ -1943,7 +1934,8 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): def _args_adjust(self): if com.is_integer(self.bins): # create common bin edge - values = np.ravel(self.data.values) + values = self.data.convert_objects()._get_numeric_data() + values = np.ravel(values) values = values[~com.isnull(values)] hist, self.bins = np.histogram(values, bins=self.bins, @@ -2515,10 +2507,7 @@ def plot_series(data, kind='line', ax=None, # Series unique """ if ax is None and len(plt.get_fignums()) > 0: ax = _gca() - ax = getattr(ax, 'left_ax', ax) - # is there harm in this? 
- if label is None: - label = data.name + ax = MPLPlot._get_ax_layer(ax) return _plot(data, kind=kind, ax=ax, figsize=figsize, use_index=use_index, title=title, grid=grid, legend=legend, @@ -3020,7 +3009,7 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, if columns is None: if not isinstance(by, (list, tuple)): by = [by] - columns = data._get_numeric_data().columns - by + columns = data._get_numeric_data().columns.difference(by) naxes = len(columns) fig, axes = _subplots(naxes=naxes, sharex=True, sharey=True, figsize=figsize, ax=ax, layout=layout) @@ -3365,11 +3354,9 @@ def _flatten(axes): def _get_all_lines(ax): lines = ax.get_lines() - # check for right_ax, which can oddly sometimes point back to ax - if hasattr(ax, 'right_ax') and ax.right_ax != ax: + if hasattr(ax, 'right_ax'): lines += ax.right_ax.get_lines() - # no such risk with left_ax if hasattr(ax, 'left_ax'): lines += ax.left_ax.get_lines() diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index ed11b12871ce5..88b4117d4807c 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -3,21 +3,20 @@ """ import warnings -from datetime import datetime, time, timedelta +from datetime import datetime, timedelta from pandas import compat import numpy as np from pandas.core import common as com -from pandas.core.common import is_integer, is_float +from pandas.core.common import is_integer, is_float, AbstractMethodError import pandas.tslib as tslib import pandas.lib as lib from pandas.core.index import Index from pandas.util.decorators import Appender, cache_readonly -from pandas.tseries.frequencies import ( - infer_freq, to_offset, get_period_alias, - Resolution) +from pandas.tseries.frequencies import infer_freq, to_offset, Resolution import pandas.algos as _algos + class DatetimeIndexOpsMixin(object): """ common ops mixin to support a unified inteface datetimelike Index """ @@ -48,7 +47,7 @@ def _box_func(self): """ box function to get object from internal representation """ - raise NotImplementedError + raise AbstractMethodError(self) def _box_values(self, values): """ @@ -61,13 +60,13 @@ def groupby(self, f): return _algos.groupby_object(objs, f) def _format_with_header(self, header, **kwargs): - return header + self._format_native_types(**kwargs) + return header + list(self._format_native_types(**kwargs)) def __contains__(self, key): try: res = self.get_loc(key) return np.isscalar(res) or type(res) == slice or np.any(res) - except (KeyError, TypeError): + except (KeyError, TypeError, ValueError): return False @property @@ -79,6 +78,11 @@ def freqstr(self): @cache_readonly def inferred_freq(self): + """ + Trys to return a string representing a frequency guess, + generated by infer_freq. Returns None if it can't autodetect the + frequency. 
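_grouped_plot_by_column above now computes the plotted columns with Index.difference instead of the old `columns - by` spelling; the set semantics are the same, just explicit. A tiny illustration (the column names are made up):

    import pandas as pd

    cols = pd.Index(['A', 'B', 'gender'])
    print(cols.difference(['gender']))   # Index(['A', 'B'], ...), sorted set difference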
+ """ try: return infer_freq(self) except ValueError: @@ -255,35 +259,25 @@ def argmax(self, axis=None): @property def _formatter_func(self): - """ - Format function to convert value to representation - """ - return str - - def _format_footer(self): - raise NotImplementedError - - def __unicode__(self): - formatter = self._formatter_func - summary = str(self.__class__) + '\n' - - n = len(self) - if n == 0: - pass - elif n == 1: - first = formatter(self[0]) - summary += '[%s]\n' % first - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary += '[%s, %s]\n' % (first, last) - else: - first = formatter(self[0]) - last = formatter(self[-1]) - summary += '[%s, ..., %s]\n' % (first, last) - - summary += self._format_footer() - return summary + raise AbstractMethodError(self) + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value) + """ + attrs = super(DatetimeIndexOpsMixin, self)._format_attrs() + for attrib in self._attributes: + if attrib == 'freq': + freq = self.freqstr + if freq is not None: + freq = "'%s'" % freq + attrs.append(('freq',freq)) + elif attrib == 'tz': + tz = self.tz + if tz is not None: + tz = "'%s'" % tz + attrs.append(('tz',tz)) + return attrs @cache_readonly def _resolution(self): @@ -314,10 +308,10 @@ def _convert_scalar_indexer(self, key, kind=None): return super(DatetimeIndexOpsMixin, self)._convert_scalar_indexer(key, kind=kind) def _add_datelike(self, other): - raise NotImplementedError + raise AbstractMethodError(self) def _sub_datelike(self, other): - raise NotImplementedError + raise AbstractMethodError(self) @classmethod def _add_datetimelike_methods(cls): @@ -505,4 +499,6 @@ def summary(self, name=None): if self.freq: result += '\nFreq: %s' % self.freqstr + # display as values, not quoted + result = result.replace("'","") return result diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 2ceece087387e..c273906ef3d05 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -6,7 +6,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex from pandas.tseries.tdi import TimedeltaIndex -from pandas import lib, tslib +from pandas import tslib from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, is_datetime_arraylike, is_integer_dtype, is_list_like, get_dtype_kinds) @@ -125,7 +125,7 @@ def to_pydatetime(self): accessors=DatetimeIndex._datetimelike_ops, typ='property') DatetimeProperties._add_delegate_accessors(delegate=DatetimeIndex, - accessors=["to_period","tz_localize","tz_convert"], + accessors=["to_period","tz_localize","tz_convert","normalize"], typ='method') class TimedeltaProperties(Properties): diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index b220e03fdb327..4af8c68110978 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -671,11 +671,11 @@ def _period_str_to_code(freqstr): def infer_freq(index, warn=True): """ Infer the most likely frequency given the input index. If the frequency is - uncertain, a warning will be printed + uncertain, a warning will be printed. Parameters ---------- - index : DatetimeIndex + index : DatetimeIndex or TimedeltaIndex if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True @@ -684,6 +684,7 @@ def infer_freq(index, warn=True): freq : string or None None if no discernible frequency TypeError if the index is not datetime-like + ValueError if there are less than three values. 
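The infer_freq docstring above is broadened to DatetimeIndex or TimedeltaIndex input and documents the ValueError for fewer than three values. A sketch using the top-level pandas entry point (assumed here to re-export tseries.frequencies.infer_freq):

    import pandas as pd

    idx = pd.date_range('2015-01-01', periods=10, freq='D')
    print(pd.infer_freq(idx))    # 'D'

    try:
        pd.infer_freq(idx[:2])   # fewer than three values cannot be inferred
    except ValueError as exc:
        print(exc)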
""" import pandas as pd @@ -742,7 +743,7 @@ def __init__(self, index, warn=True): @cache_readonly def deltas(self): return tslib.unique_deltas(self.values) - + @cache_readonly def deltas_asi8(self): return tslib.unique_deltas(self.index.asi8) @@ -750,7 +751,7 @@ def deltas_asi8(self): @cache_readonly def is_unique(self): return len(self.deltas) == 1 - + @cache_readonly def is_unique_asi8(self): return len(self.deltas_asi8) == 1 @@ -763,10 +764,13 @@ def get_freq(self): if _is_multiple(delta, _ONE_DAY): return self._infer_daily_rule() else: - # Possibly intraday frequency. Here we use the + # Business hourly, maybe. 17: one day / 65: one weekend + if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): + return 'BH' + # Possibly intraday frequency. Here we use the # original .asi8 values as the modified values # will not work around DST transitions. See #8772 - if not self.is_unique_asi8: + elif not self.is_unique_asi8: return None delta = self.deltas_asi8[0] if _is_multiple(delta, _ONE_HOUR): @@ -792,6 +796,10 @@ def get_freq(self): def day_deltas(self): return [x / _ONE_DAY for x in self.deltas] + @cache_readonly + def hour_deltas(self): + return [x / _ONE_HOUR for x in self.deltas] + @cache_readonly def fields(self): return tslib.build_field_sarray(self.values) @@ -927,7 +935,9 @@ def _get_wom_rule(self): return None week_of_months = unique((self.index.day - 1) // 7) - if len(week_of_months) > 1: + # Only attempt to infer up to WOM-4. See #9425 + week_of_months = week_of_months[week_of_months < 4] + if len(week_of_months) == 0 or len(week_of_months) > 1: return None # get which week @@ -989,7 +999,7 @@ def is_subperiod(source, target): return source in ['D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'] elif _is_quarterly(target): return source in ['D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'] - elif target == 'M': + elif _is_monthly(target): return source in ['D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'] elif _is_weekly(target): return source in [target, 'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'] @@ -1048,7 +1058,7 @@ def is_superperiod(source, target): return target in ['D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'] elif _is_quarterly(source): return target in ['D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'] - elif source == 'M': + elif _is_monthly(source): return target in ['D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'] elif _is_weekly(source): return target in [source, 'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'] @@ -1093,7 +1103,12 @@ def _quarter_months_conform(source, target): def _is_quarterly(rule): rule = rule.upper() - return rule == 'Q' or rule.startswith('Q-') + return rule == 'Q' or rule.startswith('Q-') or rule.startswith('BQ') + + +def _is_monthly(rule): + rule = rule.upper() + return rule == 'M' or rule == 'BM' def _is_weekly(rule): diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 3b3542b760d6f..f55569302ca05 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -148,6 +148,9 @@ class from pandas.tseries.offsets >>> July3rd = Holiday('July 3rd', month=7, day=3, days_of_week=(0, 1, 2, 3)) """ + if offset is not None and observance is not None: + raise NotImplementedError("Cannot use both offset and observance.") + self.name = name self.year = year self.month = month @@ -203,7 +206,10 @@ def dates(self, start_date, end_date, return_name=False): end_date = Timestamp(end_date) year_offset = DateOffset(years=1) - base_date = Timestamp(datetime(start_date.year, self.month, self.day)) + base_date = Timestamp( + 
datetime(start_date.year, self.month, self.day), + tz=start_date.tz, + ) dates = DatetimeIndex(start=base_date, end=end_date, freq=year_offset) holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: @@ -276,7 +282,7 @@ class AbstractHolidayCalendar(object): rules = [] start_date = Timestamp(datetime(1970, 1, 1)) end_date = Timestamp(datetime(2030, 12, 31)) - _holiday_cache = None + _cache = None def __init__(self, name=None, rules=None): """ @@ -348,14 +354,6 @@ def holidays(self, start=None, end=None, return_name=False): else: return holidays.index - @property - def _cache(self): - return self.__class__._holiday_cache - - @_cache.setter - def _cache(self, values): - self.__class__._holiday_cache = values - @staticmethod def merge_class(base, other): """ diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index ca5119acc8b99..745c536914e47 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1,13 +1,8 @@ # pylint: disable=E1101 import operator - from datetime import time, datetime from datetime import timedelta - import numpy as np - -import warnings - from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, _values_from_object, _maybe_box, ABCSeries, is_integer, is_float) @@ -597,7 +592,7 @@ def _is_dates_only(self): def _formatter_func(self): from pandas.core.format import _get_format_datetime64 formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: formatter(x, tz=self.tz) + return lambda x: "'%s'" % formatter(x, tz=self.tz) def __reduce__(self): @@ -658,14 +653,18 @@ def _sub_datelike(self, other): def _add_delta(self, delta): from pandas import TimedeltaIndex + name = self.name + if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) elif isinstance(delta, TimedeltaIndex): new_values = self._add_delta_tdi(delta) + # update name when delta is Index + name = com._maybe_match_name(self, delta) else: new_values = self.astype('O') + delta tz = 'UTC' if self.tz is not None else None - result = DatetimeIndex(new_values, tz=tz, freq='infer') + result = DatetimeIndex(new_values, tz=tz, name=name, freq='infer') utc = _utc() if self.tz is not None and self.tz is not utc: result = result.tz_convert(self.tz) @@ -673,20 +672,17 @@ def _add_delta(self, delta): def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): - data = self.asobject - from pandas.core.format import Datetime64Formatter - return Datetime64Formatter(values=data, - nat_rep=na_rep, - date_format=date_format, - justify='all').get_result() + from pandas.core.format import _get_format_datetime64_from_values + format = _get_format_datetime64_from_values(self, date_format) + + return tslib.format_array_from_datetime(self.asi8, + tz=self.tz, + format=format, + na_rep=na_rep) def to_datetime(self, dayfirst=False): return self.copy() - def _format_footer(self): - tagline = 'Length: %d, Freq: %s, Timezone: %s' - return tagline % (len(self), self.freqstr, self.tz) - def astype(self, dtype): dtype = np.dtype(dtype) @@ -808,6 +804,7 @@ def union(self, other): ------- y : Index or DatetimeIndex """ + self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -1043,6 +1040,7 @@ def intersection(self, other): ------- y : Index or DatetimeIndex """ + self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -1589,6 +1587,11 @@ def tz_convert(self, tz): Returns ------- normalized : DatetimeIndex + + 
Raises + ------ + TypeError + If DatetimeIndex is tz-naive. """ tz = tslib.maybe_get_tz(tz) @@ -1625,6 +1628,11 @@ def tz_localize(self, tz, ambiguous='raise'): Returns ------- localized : DatetimeIndex + + Raises + ------ + TypeError + If the DatetimeIndex is tz-aware and tz is not None. """ if self.tz is not None: if tz is None: @@ -1655,14 +1663,15 @@ def indexer_at_time(self, time, asof=False): from dateutil.parser import parse if asof: - raise NotImplementedError + raise NotImplementedError("'asof' argument is not supported") if isinstance(time, compat.string_types): time = parse(time).time() if time.tzinfo: # TODO - raise NotImplementedError + raise NotImplementedError("argument 'time' with timezone info is " + "not supported") time_micros = self._get_time_micros() micros = _time_to_micros(time) @@ -1694,7 +1703,8 @@ def indexer_between_time(self, start_time, end_time, include_start=True, end_time = parse(end_time).time() if start_time.tzinfo or end_time.tzinfo: - raise NotImplementedError + raise NotImplementedError("argument 'time' with timezone info is " + "not supported") time_micros = self._get_time_micros() start_micros = _time_to_micros(start_time) @@ -1773,7 +1783,8 @@ def _generate_regular_range(start, end, periods, offset): b = e - np.int64(periods) * stride tz = end.tz else: - raise NotImplementedError + raise ValueError("at least 'start' or 'end' should be specified " + "if a 'period' is given.") data = np.arange(b, e, stride, dtype=np.int64) data = DatetimeIndex._simple_new(data, None, tz=tz) diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py index 104e088ee4e84..bcce64c3a71bf 100644 --- a/pandas/tseries/interval.py +++ b/pandas/tseries/interval.py @@ -1,4 +1,3 @@ -import numpy as np from pandas.core.index import Index diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index cb6bd2fb2b250..67e27bbffbf73 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -16,6 +16,7 @@ __all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay', 'CBMonthEnd','CBMonthBegin', 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd', + 'BusinessHour', 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd', 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd', 'LastWeekOfMonth', 'FY5253Quarter', 'FY5253', @@ -404,10 +405,6 @@ def __repr__(self): if hasattr(self, '_named'): return self._named className = getattr(self, '_outputName', self.__class__.__name__) - attrs = [] - - if self.offset: - attrs = ['offset=%s' % repr(self.offset)] if abs(self.n) != 1: plural = 's' @@ -418,10 +415,17 @@ def __repr__(self): if self.n != 1: n_str = "%s * " % self.n - out = '<%s' % n_str + className + plural + out = '<%s' % n_str + className + plural + self._repr_attrs() + '>' + return out + + def _repr_attrs(self): + if self.offset: + attrs = ['offset=%s' % repr(self.offset)] + else: + attrs = None + out = '' if attrs: out += ': ' + ', '.join(attrs) - out += '>' return out class BusinessDay(BusinessMixin, SingleConstructorOffset): @@ -531,6 +535,234 @@ def onOffset(self, dt): return dt.weekday() < 5 +class BusinessHour(BusinessMixin, SingleConstructorOffset): + """ + DateOffset subclass representing possibly n business days + """ + _prefix = 'BH' + _anchor = 0 + + def __init__(self, n=1, normalize=False, **kwds): + self.n = int(n) + self.normalize = normalize + + # must be validated here to equality check + kwds['start'] = self._validate_time(kwds.get('start', '09:00')) + kwds['end'] = self._validate_time(kwds.get('end', '17:00')) + self.kwds = 
kwds + self.offset = kwds.get('offset', timedelta(0)) + self.start = kwds.get('start', '09:00') + self.end = kwds.get('end', '17:00') + + # used for moving to next businessday + if self.n >= 0: + self.next_bday = BusinessDay(n=1) + else: + self.next_bday = BusinessDay(n=-1) + + def _validate_time(self, t_input): + from datetime import time as dt_time + import time + if isinstance(t_input, compat.string_types): + try: + t = time.strptime(t_input, '%H:%M') + return dt_time(hour=t.tm_hour, minute=t.tm_min) + except ValueError: + raise ValueError("time data must match '%H:%M' format") + elif isinstance(t_input, dt_time): + if t_input.second != 0 or t_input.microsecond != 0: + raise ValueError("time data must be specified only with hour and minute") + return t_input + else: + raise ValueError("time data must be string or datetime.time") + + def _get_daytime_flag(self): + if self.start == self.end: + raise ValueError('start and end must not be the same') + elif self.start < self.end: + return True + else: + return False + + def _repr_attrs(self): + out = super(BusinessHour, self)._repr_attrs() + attrs = ['BH=%s-%s' % (self.start.strftime('%H:%M'), + self.end.strftime('%H:%M'))] + out += ': ' + ', '.join(attrs) + return out + + def _next_opening_time(self, other): + """ + If n is positive, return tomorrow's business day opening time. + Otherwise yesterday's business day's opening time. + + Opening time always locates on BusinessDay. + Otherwise, closing time may not if business hour extends over midnight. + """ + if not self.next_bday.onOffset(other): + other = other + self.next_bday + else: + if self.n >= 0 and self.start < other.time(): + other = other + self.next_bday + elif self.n < 0 and other.time() < self.start: + other = other + self.next_bday + return datetime(other.year, other.month, other.day, + self.start.hour, self.start.minute) + + def _prev_opening_time(self, other): + """ + If n is positive, return yesterday's business day opening time. + Otherwise yesterday business day's opening time. + """ + if not self.next_bday.onOffset(other): + other = other - self.next_bday + else: + if self.n >= 0 and other.time() < self.start: + other = other - self.next_bday + elif self.n < 0 and other.time() > self.start: + other = other - self.next_bday + return datetime(other.year, other.month, other.day, + self.start.hour, self.start.minute) + + def _get_business_hours_by_sec(self): + """ + Return business hours in a day by seconds. 
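A brief usage sketch of the new BusinessHour offset, using the same values exercised by the tests added later in this patch (default business hours are 09:00-17:00):

    >>> from pandas import Timestamp
    >>> from pandas.tseries.offsets import BusinessHour
    >>> Timestamp('2014-07-01 10:00') + BusinessHour()
    Timestamp('2014-07-01 11:00:00')
    >>> Timestamp('2014-07-01 10:00') + BusinessHour(n=3)
    Timestamp('2014-07-01 13:00:00')
    >>> Timestamp('2014-07-01 10:00') + BusinessHour(n=-1)
    Timestamp('2014-06-30 17:00:00')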
+ """ + if self._get_daytime_flag(): + # create dummy datetime to calcurate businesshours in a day + dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) + until = datetime(2014, 4, 1, self.end.hour, self.end.minute) + return tslib.tot_seconds(until - dtstart) + else: + self.daytime = False + dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) + until = datetime(2014, 4, 2, self.end.hour, self.end.minute) + return tslib.tot_seconds(until - dtstart) + + @apply_wraps + def rollback(self, dt): + """Roll provided date backward to next offset only if not on offset""" + if not self.onOffset(dt): + businesshours = self._get_business_hours_by_sec() + if self.n >= 0: + dt = self._prev_opening_time(dt) + timedelta(seconds=businesshours) + else: + dt = self._next_opening_time(dt) + timedelta(seconds=businesshours) + return dt + + @apply_wraps + def rollforward(self, dt): + """Roll provided date forward to next offset only if not on offset""" + if not self.onOffset(dt): + if self.n >= 0: + return self._next_opening_time(dt) + else: + return self._prev_opening_time(dt) + return dt + + @apply_wraps + def apply(self, other): + # calcurate here because offset is not immutable + daytime = self._get_daytime_flag() + businesshours = self._get_business_hours_by_sec() + bhdelta = timedelta(seconds=businesshours) + + if isinstance(other, datetime): + # used for detecting edge condition + nanosecond = getattr(other, 'nanosecond', 0) + # reset timezone and nanosecond + # other may be a Timestamp, thus not use replace + other = datetime(other.year, other.month, other.day, + other.hour, other.minute, + other.second, other.microsecond) + n = self.n + if n >= 0: + if (other.time() == self.end or + not self._onOffset(other, businesshours)): + other = self._next_opening_time(other) + else: + if other.time() == self.start: + # adjustment to move to previous business day + other = other - timedelta(seconds=1) + if not self._onOffset(other, businesshours): + other = self._next_opening_time(other) + other = other + bhdelta + + bd, r = divmod(abs(n * 60), businesshours // 60) + if n < 0: + bd, r = -bd, -r + + if bd != 0: + skip_bd = BusinessDay(n=bd) + # midnight busienss hour may not on BusinessDay + if not self.next_bday.onOffset(other): + remain = other - self._prev_opening_time(other) + other = self._next_opening_time(other + skip_bd) + remain + else: + other = other + skip_bd + + hours, minutes = divmod(r, 60) + result = other + timedelta(hours=hours, minutes=minutes) + + # because of previous adjustment, time will be larger than start + if ((daytime and (result.time() < self.start or self.end < result.time())) or + not daytime and (self.end < result.time() < self.start)): + if n >= 0: + bday_edge = self._prev_opening_time(other) + bday_edge = bday_edge + bhdelta + # calcurate remainder + bday_remain = result - bday_edge + result = self._next_opening_time(other) + result += bday_remain + else: + bday_edge = self._next_opening_time(other) + bday_remain = result - bday_edge + result = self._next_opening_time(result) + bhdelta + result += bday_remain + # edge handling + if n >= 0: + if result.time() == self.end: + result = self._next_opening_time(result) + else: + if result.time() == self.start and nanosecond == 0: + # adjustment to move to previous business day + result = self._next_opening_time(result- timedelta(seconds=1)) +bhdelta + + return result + else: + raise ApplyTypeError('Only know how to combine business hour with ') + + def onOffset(self, dt): + if self.normalize and not 
_is_normalized(dt): + return False + + if dt.tzinfo is not None: + dt = datetime(dt.year, dt.month, dt.day, dt.hour, + dt.minute, dt.second, dt.microsecond) + # Valid BH can be on the different BusinessDay during midnight + # Distinguish by the time spent from previous opening time + businesshours = self._get_business_hours_by_sec() + return self._onOffset(dt, businesshours) + + def _onOffset(self, dt, businesshours): + """ + Slight speedups using calcurated values + """ + # if self.normalize and not _is_normalized(dt): + # return False + # Valid BH can be on the different BusinessDay during midnight + # Distinguish by the time spent from previous opening time + if self.n >= 0: + op = self._prev_opening_time(dt) + else: + op = self._next_opening_time(dt) + span = tslib.tot_seconds(dt - op) + if span <= businesshours: + return True + else: + return False + + class CustomBusinessDay(BusinessDay): """ **EXPERIMENTAL** DateOffset subclass representing possibly n business days @@ -2250,6 +2482,7 @@ def generate_range(start=None, end=None, periods=None, BusinessMonthEnd, # 'BM' BQuarterEnd, # 'BQ' BQuarterBegin, # 'BQS' + BusinessHour, # 'BH' CustomBusinessDay, # 'C' CustomBusinessMonthEnd, # 'CBM' CustomBusinessMonthBegin, # 'CBMS' diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index b1f0ba1f127fa..6627047f0c335 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,10 +1,6 @@ # pylint: disable=E1101,E1103,W0232 -import operator - -from datetime import datetime, date, timedelta +from datetime import datetime, timedelta import numpy as np -from pandas.core.base import PandasObject - import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries.index import DatetimeIndex, Int64Index, Index @@ -114,20 +110,20 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): Parameters ---------- - data : array-like (1-dimensional), optional + data : array-like (1-dimensional), optional Optional period-like data to construct index with dtype : NumPy dtype (default: i8) - copy : bool + copy : bool Make a copy of input ndarray freq : string or period object, optional One of pandas period strings or corresponding objects start : starting value, period-like, optional If data is None, used as the start point in generating regular period data. - periods : int, optional, > 0 + periods : int, optional, > 0 Number of periods to generate, if generating index. Takes precedence over end argument - end : end value, period-like, optional + end : end value, period-like, optional If periods is none, generated index will extend to first conforming period on or just past end argument year : int, array, or Series, default None @@ -293,6 +289,10 @@ def _to_embed(self, keep_tz=False): """ return an array repr of this object, potentially casting to object """ return self.asobject.values + @property + def _formatter_func(self): + return lambda x: "'%s'" % x + def asof_locs(self, where, mask): """ where : array of timestamps @@ -355,6 +355,44 @@ def freqstr(self): return self.freq def asfreq(self, freq=None, how='E'): + """ + Convert the PeriodIndex to the specified frequency `freq`. + + Parameters + ---------- + + freq : str + a frequency + how : str {'E', 'S'} + 'E', 'END', or 'FINISH' for end, + 'S', 'START', or 'BEGIN' for start. + Whether the elements should be aligned to the end + or start within pa period. January 31st ('END') vs. + Janury 1st ('START') for example. 
+ + Returns + ------- + + new : PeriodIndex with the new frequency + + Examples + -------- + >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') + >>> pidx + + [2010, ..., 2015] + Length: 6, Freq: A-DEC + + >>> pidx.asfreq('M') + + [2010-12, ..., 2015-12] + Length: 6, Freq: M + + >>> pidx.asfreq('M', how='S') + + [2010-01, ..., 2015-01] + Length: 6, Freq: M + """ how = _validate_end_alias(how) freq = frequencies.get_standard_freq(freq) @@ -387,7 +425,7 @@ def to_datetime(self, dayfirst=False): qyear = _field_accessor('qyear', 1) days_in_month = _field_accessor('days_in_month', 11, "The number of days in the month") daysinmonth = days_in_month - + def _get_object_array(self): freq = self.freq return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False) @@ -463,7 +501,6 @@ def shift(self, n): ---------- n : int Periods to shift by - freq : freq string Returns ------- @@ -642,6 +679,8 @@ def join(self, other, how='left', level=None, return_indexers=False): return self._apply_meta(result) def _assert_can_do_setop(self, other): + super(PeriodIndex, self)._assert_can_do_setop(other) + if not isinstance(other, PeriodIndex): raise ValueError('can only call with other PeriodIndex-ed objects') @@ -687,7 +726,7 @@ def _format_native_types(self, na_rep=u('NaT'), **kwargs): imask = ~mask values[imask] = np.array([u('%s') % dt for dt in values[imask]]) - return values.tolist() + return values def __array_finalize__(self, obj): if not self.ndim: # pragma: no cover @@ -697,10 +736,6 @@ def __array_finalize__(self, obj): self.name = getattr(obj, 'name', None) self._reset_identity() - def _format_footer(self): - tagline = 'Length: %d, Freq: %s' - return tagline % (len(self), self.freqstr) - def take(self, indices, axis=None): """ Analogous to ndarray.take @@ -936,8 +971,8 @@ def period_range(start=None, end=None, periods=None, freq='D', name=None): Parameters ---------- - start : - end : + start : starting value, period-like, optional + end : ending value, period-like, optional periods : int, default None Number of periods in the index freq : str/DateOffset, default 'D' diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 899d2bfdc9c76..9d28fa11f646f 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -5,17 +5,13 @@ #!!! 
TODO: Use the fact that axis can have units to simplify the process from matplotlib import pylab - -import numpy as np - -from pandas import isnull from pandas.tseries.period import Period from pandas.tseries.offsets import DateOffset import pandas.tseries.frequencies as frequencies from pandas.tseries.index import DatetimeIndex import pandas.core.common as com -from pandas.tseries.converter import (PeriodConverter, TimeSeries_DateLocator, +from pandas.tseries.converter import (TimeSeries_DateLocator, TimeSeries_DateFormatter) #---------------------------------------------------------------------- diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 7607bef0f1d71..53c1292204f71 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1,14 +1,11 @@ from datetime import timedelta - import numpy as np - from pandas.core.groupby import BinGrouper, Grouper from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod from pandas.tseries.index import DatetimeIndex, date_range from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds from pandas.tseries.period import PeriodIndex, period_range -import pandas.tseries.tools as tools import pandas.core.common as com import pandas.compat as compat @@ -373,11 +370,11 @@ def _take_new_index(obj, indexer, new_index, axis=0): return Series(new_values, index=new_index, name=obj.name) elif isinstance(obj, DataFrame): if axis == 1: - raise NotImplementedError + raise NotImplementedError("axis 1 is not supported") return DataFrame(obj._data.reindex_indexer( new_axis=new_index, indexer=indexer, axis=1)) else: - raise NotImplementedError + raise ValueError("'obj' should be either a Series or a DataFrame") def _get_range_edges(first, last, offset, closed='left', base=0): @@ -467,7 +464,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False): """ if isinstance(obj.index, PeriodIndex): if method is not None: - raise NotImplementedError + raise NotImplementedError("'method' argument is not supported") if how is None: how = 'E' @@ -480,6 +477,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False): if len(obj.index) == 0: return obj.copy() dti = date_range(obj.index[0], obj.index[-1], freq=freq) + dti.name = obj.index.name rs = obj.reindex(dti, method=method) if normalize: rs.index = rs.index.normalize() diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index e01ff54feab57..de68dd763d68c 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -1,17 +1,13 @@ """ implement the TimedeltaIndex """ -import operator -import datetime from datetime import timedelta import numpy as np - from pandas.core.common import (ABCSeries, _TD_DTYPE, _INT64_DTYPE, is_timedelta64_dtype, _maybe_box, _values_from_object, isnull, is_integer, is_float) from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u -from pandas.core.base import PandasObject from pandas.util.decorators import cache_readonly from pandas.tseries.frequencies import to_offset import pandas.core.common as com @@ -140,7 +136,7 @@ def __new__(cls, data=None, unit=None, copy=False, name=None, closed=None, verify_integrity=True, **kwargs): - if isinstance(data, TimedeltaIndex) and freq is None: + if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: data = data.copy() return data @@ -274,10 +270,6 @@ def _formatter_func(self): from pandas.core.format import _get_format_timedelta64 
return _get_format_timedelta64(self, box=True) - def _format_footer(self): - tagline = 'Length: %d, Freq: %s' - return tagline % (len(self), self.freqstr) - def __setstate__(self, state): """Necessary for making this object picklable""" if isinstance(state, dict): @@ -289,12 +281,15 @@ def __setstate__(self, state): def _add_delta(self, delta): if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) + name = self.name elif isinstance(delta, TimedeltaIndex): new_values = self._add_delta_tdi(delta) + # update name when delta is index + name = com._maybe_match_name(self, delta) else: raise ValueError("cannot add the type {0} to a TimedeltaIndex".format(type(delta))) - result = TimedeltaIndex(new_values, freq='infer') + result = TimedeltaIndex(new_values, freq='infer', name=name) return result def _evaluate_with_timedelta_like(self, other, op, opstr): @@ -441,12 +436,12 @@ def union(self, other): ------- y : Index or TimedeltaIndex """ - if _is_convertible_to_index(other): + self._assert_can_do_setop(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) - except TypeError: + except (TypeError, ValueError): pass - this, other = self, other if this._can_fast_union(other): @@ -586,6 +581,7 @@ def intersection(self, other): ------- y : Index or TimedeltaIndex """ + self._assert_can_do_setop(other) if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) @@ -927,7 +923,8 @@ def _generate_regular_range(start, end, periods, offset): e = Timedelta(end).value + stride b = e - periods * stride else: - raise NotImplementedError + raise ValueError("at least 'start' or 'end' should be specified " + "if a 'period' is given.") data = np.arange(b, e, stride, dtype=np.int64) data = TimedeltaIndex._simple_new(data, None) diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index c42802bdb31ad..55482401a20f4 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -119,29 +119,24 @@ def test_representation(self): idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern') - exp1 = """ -Length: 0, Freq: D, Timezone: None""" - exp2 = """ -[2011-01-01] -Length: 1, Freq: D, Timezone: None""" - exp3 = """ -[2011-01-01, 2011-01-02] -Length: 2, Freq: D, Timezone: None""" - exp4 = """ -[2011-01-01, ..., 2011-01-03] -Length: 3, Freq: D, Timezone: None""" - exp5 = """ -[2011-01-01 09:00:00+09:00, ..., 2011-01-01 11:00:00+09:00] -Length: 3, Freq: H, Timezone: Asia/Tokyo""" - exp6 = """ -[2011-01-01 09:00:00-05:00, ..., NaT] -Length: 3, Freq: None, Timezone: US/Eastern""" + exp1 = """DatetimeIndex([], dtype='datetime64[ns]', freq='D', tz=None)""" - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], - [exp1, exp2, exp3, exp4, exp5, exp6]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - self.assertEqual(result, expected) + exp2 = """DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', freq='D', tz=None)""" + + exp3 = """DatetimeIndex(['2011-01-01', '2011-01-02'], dtype='datetime64[ns]', freq='D', tz=None)""" + + exp4 = """DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq='D', tz=None)""" + + exp5 = """DatetimeIndex(['2011-01-01 09:00:00+09:00', '2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00'], dtype='datetime64[ns]', freq='H', tz='Asia/Tokyo')""" + + exp6 = """DatetimeIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', 'NaT'], 
dtype='datetime64[ns]', freq=None, tz='US/Eastern')""" + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], + [exp1, exp2, exp3, exp4, exp5, exp6]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) def test_summary(self): # GH9116 @@ -372,27 +367,22 @@ def test_representation(self): idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" - exp1 = """ -Length: 0, Freq: D""" - exp2 = """ -['1 days'] -Length: 1, Freq: D""" - exp3 = """ -['1 days', '2 days'] -Length: 2, Freq: D""" - exp4 = """ -['1 days', ..., '3 days'] -Length: 3, Freq: D""" - exp5 = """ -['1 days 00:00:01', ..., '3 days 00:00:00'] -Length: 3, Freq: None""" + exp2 = """TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', freq='D')""" - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - self.assertEqual(result, expected) + exp3 = """TimedeltaIndex(['1 days', '2 days'], dtype='timedelta64[ns]', freq='D')""" + + exp4 = """TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq='D')""" + + exp5 = """TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', '3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)""" + + with pd.option_context('display.width',300): + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) def test_summary(self): # GH9116 @@ -404,13 +394,13 @@ def test_summary(self): exp1 = """TimedeltaIndex: 0 entries Freq: D""" - exp2 = """TimedeltaIndex: 1 entries, '1 days' to '1 days' + exp2 = """TimedeltaIndex: 1 entries, 1 days to 1 days Freq: D""" - exp3 = """TimedeltaIndex: 2 entries, '1 days' to '2 days' + exp3 = """TimedeltaIndex: 2 entries, 1 days to 2 days Freq: D""" - exp4 = """TimedeltaIndex: 3 entries, '1 days' to '3 days' + exp4 = """TimedeltaIndex: 3 entries, 1 days to 3 days Freq: D""" - exp5 = """TimedeltaIndex: 3 entries, '1 days 00:00:01' to '3 days 00:00:00'""" + exp5 = """TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days 00:00:00""" for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5]): @@ -483,8 +473,8 @@ def test_ops_compat(self): tm.assert_index_equal(result,expected) # divide with nats - rng = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') - expected = Float64Index([12,np.nan,24]) + rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + expected = Float64Index([12, np.nan, 24], name='foo') for offset in offsets: result = rng / offset tm.assert_index_equal(result,expected) @@ -495,8 +485,8 @@ def test_ops_compat(self): def test_subtraction_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') - dti = date_range('20130101',periods=3) + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') td = Timedelta('1 days') dt = Timestamp('20130101') @@ -505,29 +495,29 @@ def test_subtraction_ops(self): self.assertRaises(TypeError, lambda : td - dt) self.assertRaises(TypeError, lambda : td - dti) - result = dt-dti - expected = TimedeltaIndex(['0 days','-1 
days','-2 days']) - tm.assert_index_equal(result,expected) + result = dt - dti + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') + tm.assert_index_equal(result, expected) - result = dti-dt - expected = TimedeltaIndex(['0 days','1 days','2 days']) - tm.assert_index_equal(result,expected) + result = dti - dt + expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') + tm.assert_index_equal(result, expected) - result = tdi-td - expected = TimedeltaIndex(['0 days',pd.NaT,'1 days']) - tm.assert_index_equal(result,expected) + result = tdi - td + expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') + tm.assert_index_equal(result, expected, check_names=False) - result = td-tdi - expected = TimedeltaIndex(['0 days',pd.NaT,'-1 days']) - tm.assert_index_equal(result,expected) + result = td - tdi + expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') + tm.assert_index_equal(result, expected, check_names=False) - result = dti-td - expected = DatetimeIndex(['20121231','20130101','20130102']) - tm.assert_index_equal(result,expected) + result = dti - td + expected = DatetimeIndex(['20121231', '20130101', '20130102'], name='bar') + tm.assert_index_equal(result, expected, check_names=False) - result = dt-tdi - expected = DatetimeIndex(['20121231',pd.NaT,'20121230']) - tm.assert_index_equal(result,expected) + result = dt - tdi + expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') + tm.assert_index_equal(result, expected) def test_subtraction_ops_with_tz(self): @@ -644,46 +634,46 @@ def test_dti_dti_deprecated_ops(self): def test_dti_tdi_numeric_ops(self): # These are normally union/diff set-like ops - tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') - dti = date_range('20130101',periods=3) + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') td = Timedelta('1 days') dt = Timestamp('20130101') - result = tdi-tdi - expected = TimedeltaIndex(['0 days',pd.NaT,'0 days']) - tm.assert_index_equal(result,expected) + result = tdi - tdi + expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') + tm.assert_index_equal(result, expected) - result = tdi+tdi - expected = TimedeltaIndex(['2 days',pd.NaT,'4 days']) - tm.assert_index_equal(result,expected) + result = tdi + tdi + expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') + tm.assert_index_equal(result, expected) - result = dti-tdi - expected = DatetimeIndex(['20121231',pd.NaT,'20130101']) - tm.assert_index_equal(result,expected) + result = dti - tdi # name will be reset + expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) + tm.assert_index_equal(result, expected) def test_addition_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') - dti = date_range('20130101',periods=3) + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') td = Timedelta('1 days') dt = Timestamp('20130101') result = tdi + dt - expected = DatetimeIndex(['20130102',pd.NaT,'20130103']) - tm.assert_index_equal(result,expected) + expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + tm.assert_index_equal(result, expected) result = dt + tdi - expected = DatetimeIndex(['20130102',pd.NaT,'20130103']) - tm.assert_index_equal(result,expected) + expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + tm.assert_index_equal(result, expected) result = td + 
tdi - expected = TimedeltaIndex(['2 days',pd.NaT,'3 days']) - tm.assert_index_equal(result,expected) + expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + tm.assert_index_equal(result, expected) result = tdi + td - expected = TimedeltaIndex(['2 days',pd.NaT,'3 days']) - tm.assert_index_equal(result,expected) + expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + tm.assert_index_equal(result, expected) # unequal length self.assertRaises(ValueError, lambda : tdi + dti[0:1]) @@ -695,21 +685,21 @@ def test_addition_ops(self): # this is a union! #self.assertRaises(TypeError, lambda : Int64Index([1,2,3]) + tdi) - result = tdi + dti - expected = DatetimeIndex(['20130102',pd.NaT,'20130105']) - tm.assert_index_equal(result,expected) + result = tdi + dti # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) - result = dti + tdi - expected = DatetimeIndex(['20130102',pd.NaT,'20130105']) - tm.assert_index_equal(result,expected) + result = dti + tdi # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) result = dt + td expected = Timestamp('20130102') - self.assertEqual(result,expected) + self.assertEqual(result, expected) result = td + dt expected = Timestamp('20130102') - self.assertEqual(result,expected) + self.assertEqual(result, expected) def test_value_counts_unique(self): # GH 7735 @@ -745,6 +735,13 @@ def test_nonunique_contains(self): ['00:01:00', '00:01:00', '00:00:01'])): tm.assertIn(idx[0], idx) + def test_unknown_attribute(self): + #GH 9680 + tdi = pd.timedelta_range(start=0,periods=10,freq='1s') + ts = pd.Series(np.random.normal(size=10),index=tdi) + self.assertNotIn('foo',ts.__dict__.keys()) + self.assertRaises(AttributeError,lambda : ts.foo) + class TestPeriodIndexOps(Ops): @@ -835,32 +832,23 @@ def test_representation(self): idx8 = pd.period_range('2013Q1', periods=2, freq="Q") idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - exp1 = """ -Length: 0, Freq: D""" - exp2 = """ -[2011-01-01] -Length: 1, Freq: D""" - exp3 = """ -[2011-01-01, 2011-01-02] -Length: 2, Freq: D""" - exp4 = """ -[2011-01-01, ..., 2011-01-03] -Length: 3, Freq: D""" - exp5 = """ -[2011, ..., 2013] -Length: 3, Freq: A-DEC""" - exp6 = """ -[2011-01-01 09:00, ..., NaT] -Length: 3, Freq: H""" - exp7 = """ -[2013Q1] -Length: 1, Freq: Q-DEC""" - exp8 = """ -[2013Q1, 2013Q2] -Length: 2, Freq: Q-DEC""" - exp9 = """ -[2013Q1, ..., 2013Q3] -Length: 3, Freq: Q-DEC""" + exp1 = """PeriodIndex([], dtype='int64', freq='D')""" + + exp2 = """PeriodIndex(['2011-01-01'], dtype='int64', freq='D')""" + + exp3 = """PeriodIndex(['2011-01-01', '2011-01-02'], dtype='int64', freq='D')""" + + exp4 = """PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='int64', freq='D')""" + + exp5 = """PeriodIndex(['2011', '2012', '2013'], dtype='int64', freq='A-DEC')""" + + exp6 = """PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], dtype='int64', freq='H')""" + + exp7 = """PeriodIndex(['2013Q1'], dtype='int64', freq='Q-DEC')""" + + exp8 = """PeriodIndex(['2013Q1', '2013Q2'], dtype='int64', freq='Q-DEC')""" + + exp9 = """PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], dtype='int64', freq='Q-DEC')""" for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9]): diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 841d81c15b4e9..69b1d84670d45 
100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -441,7 +441,7 @@ def test_month_range_union_tz_pytz(self): def test_month_range_union_tz_dateutil(self): _skip_if_windows_python_3() tm._skip_if_no_dateutil() - from dateutil.zoneinfo import gettz as timezone + from pandas.tslib import _dateutil_gettz as timezone tz = timezone('US/Eastern') early_start = datetime(2011, 1, 1) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 965c198eb7c95..823c762c692e5 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -196,6 +196,7 @@ def _check_tick(self, base_delta, code): index = _dti([b + base_delta * j for j in range(3)] + [b + base_delta * 7]) + self.assertIsNone(frequencies.infer_freq(index)) def test_weekly(self): @@ -211,6 +212,16 @@ def test_week_of_month(self): for i in range(1, 5): self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day)) + def test_fifth_week_of_month(self): + # Only supports freq up to WOM-4. See #9425 + func = lambda: date_range('2014-01-01', freq='WOM-5MON') + self.assertRaises(ValueError, func) + + def test_fifth_week_of_month_infer(self): + # Only attempts to infer up to WOM-4. See #9425 + index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"]) + assert frequencies.infer_freq(index) is None + def test_week_of_month_fake(self): #All of these dates are on same day of week and are 4 or 5 weeks apart index = DatetimeIndex(["2013-08-27","2013-10-01","2013-10-29","2013-11-26"]) @@ -324,10 +335,40 @@ def test_infer_freq_tz_transition(self): idx = date_range(date_pair[0], date_pair[1], freq=freq, tz=tz) print(idx) self.assertEqual(idx.inferred_freq, freq) - + index = date_range("2013-11-03", periods=5, freq="3H").tz_localize("America/Chicago") self.assertIsNone(index.inferred_freq) + def test_infer_freq_businesshour(self): + # GH 7905 + idx = DatetimeIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', + '2014-07-01 12:00', '2014-07-01 13:00', '2014-07-01 14:00']) + # hourly freq in a day must result in 'H' + self.assertEqual(idx.inferred_freq, 'H') + + idx = DatetimeIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', + '2014-07-01 12:00', '2014-07-01 13:00', '2014-07-01 14:00', + '2014-07-01 15:00', '2014-07-01 16:00', + '2014-07-02 09:00', '2014-07-02 10:00', '2014-07-02 11:00']) + self.assertEqual(idx.inferred_freq, 'BH') + + idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00', + '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00', + '2014-07-04 15:00', '2014-07-04 16:00', + '2014-07-07 09:00', '2014-07-07 10:00', '2014-07-07 11:00']) + self.assertEqual(idx.inferred_freq, 'BH') + + idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00', + '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00', + '2014-07-04 15:00', '2014-07-04 16:00', + '2014-07-07 09:00', '2014-07-07 10:00', '2014-07-07 11:00', + '2014-07-07 12:00', '2014-07-07 13:00', '2014-07-07 14:00', + '2014-07-07 15:00', '2014-07-07 16:00', + '2014-07-08 09:00', '2014-07-08 10:00', '2014-07-08 11:00', + '2014-07-08 12:00', '2014-07-08 13:00', '2014-07-08 14:00', + '2014-07-08 15:00', '2014-07-08 16:00']) + self.assertEqual(idx.inferred_freq, 'BH') + def test_not_monotonic(self): rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) rng = rng[::-1] diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tseries/tests/test_holiday.py index 
c2300481eca43..7d233ba78e7b6 100644 --- a/pandas/tseries/tests/test_holiday.py +++ b/pandas/tseries/tests/test_holiday.py @@ -1,6 +1,7 @@ from datetime import datetime import pandas.util.testing as tm +from pandas import DatetimeIndex from pandas.tseries.holiday import ( USFederalHolidayCalendar, USMemorialDay, USThanksgivingDay, nearest_workday, next_monday_or_tuesday, next_monday, @@ -9,6 +10,7 @@ HolidayCalendarFactory, next_workday, previous_workday, before_nearest_workday, EasterMonday, GoodFriday, after_nearest_workday, weekend_to_monday) +from pytz import utc import nose class TestCalendar(tm.TestCase): @@ -49,93 +51,148 @@ def test_calendar(self): self.assertEqual(list(holidays_2.to_pydatetime()), self.holiday_list) + def test_calendar_caching(self): + # Test for issue #9552 + + class TestCalendar(AbstractHolidayCalendar): + def __init__(self, name=None, rules=None): + super(TestCalendar, self).__init__( + name=name, + rules=rules + ) + + jan1 = TestCalendar(rules=[Holiday('jan1', year=2015, month=1, day=1)]) + jan2 = TestCalendar(rules=[Holiday('jan2', year=2015, month=1, day=2)]) + + tm.assert_index_equal( + jan1.holidays(), + DatetimeIndex(['01-Jan-2015']) + ) + tm.assert_index_equal( + jan2.holidays(), + DatetimeIndex(['02-Jan-2015']) + ) + + class TestHoliday(tm.TestCase): def setUp(self): self.start_date = datetime(2011, 1, 1) self.end_date = datetime(2020, 12, 31) + def check_results(self, holiday, start, end, expected): + self.assertEqual(list(holiday.dates(start, end)), expected) + # Verify that timezone info is preserved. + self.assertEqual( + list( + holiday.dates( + utc.localize(Timestamp(start)), + utc.localize(Timestamp(end)), + ) + ), + [utc.localize(dt) for dt in expected], + ) + def test_usmemorialday(self): - holidays = USMemorialDay.dates(self.start_date, - self.end_date) - holidayList = [ - datetime(2011, 5, 30), - datetime(2012, 5, 28), - datetime(2013, 5, 27), - datetime(2014, 5, 26), - datetime(2015, 5, 25), - datetime(2016, 5, 30), - datetime(2017, 5, 29), - datetime(2018, 5, 28), - datetime(2019, 5, 27), - datetime(2020, 5, 25), - ] - self.assertEqual(list(holidays), holidayList) + self.check_results( + holiday=USMemorialDay, + start=self.start_date, + end=self.end_date, + expected=[ + datetime(2011, 5, 30), + datetime(2012, 5, 28), + datetime(2013, 5, 27), + datetime(2014, 5, 26), + datetime(2015, 5, 25), + datetime(2016, 5, 30), + datetime(2017, 5, 29), + datetime(2018, 5, 28), + datetime(2019, 5, 27), + datetime(2020, 5, 25), + ], + ) def test_non_observed_holiday(self): - july_3rd = Holiday('July 4th Eve', month=7, day=3) - result = july_3rd.dates("2001-01-01", "2003-03-03") - expected = [Timestamp('2001-07-03 00:00:00'), - Timestamp('2002-07-03 00:00:00')] - self.assertEqual(list(result), expected) - july_3rd = Holiday('July 4th Eve', month=7, day=3, - days_of_week=(0, 1, 2, 3)) - result = july_3rd.dates("2001-01-01", "2008-03-03") - expected = [Timestamp('2001-07-03 00:00:00'), - Timestamp('2002-07-03 00:00:00'), - Timestamp('2003-07-03 00:00:00'), - Timestamp('2006-07-03 00:00:00'), - Timestamp('2007-07-03 00:00:00')] - self.assertEqual(list(result), expected) + + self.check_results( + Holiday('July 4th Eve', month=7, day=3), + start="2001-01-01", + end="2003-03-03", + expected=[ + Timestamp('2001-07-03 00:00:00'), + Timestamp('2002-07-03 00:00:00') + ] + ) + + self.check_results( + Holiday('July 4th Eve', month=7, day=3, days_of_week=(0, 1, 2, 3)), + start="2001-01-01", + end="2008-03-03", + expected=[ + Timestamp('2001-07-03 00:00:00'), + 
Timestamp('2002-07-03 00:00:00'), + Timestamp('2003-07-03 00:00:00'), + Timestamp('2006-07-03 00:00:00'), + Timestamp('2007-07-03 00:00:00'), + ] + ) def test_easter(self): - holidays = EasterMonday.dates(self.start_date, - self.end_date) - holidayList = [Timestamp('2011-04-25 00:00:00'), - Timestamp('2012-04-09 00:00:00'), - Timestamp('2013-04-01 00:00:00'), - Timestamp('2014-04-21 00:00:00'), - Timestamp('2015-04-06 00:00:00'), - Timestamp('2016-03-28 00:00:00'), - Timestamp('2017-04-17 00:00:00'), - Timestamp('2018-04-02 00:00:00'), - Timestamp('2019-04-22 00:00:00'), - Timestamp('2020-04-13 00:00:00')] - - - self.assertEqual(list(holidays), holidayList) - holidays = GoodFriday.dates(self.start_date, - self.end_date) - holidayList = [Timestamp('2011-04-22 00:00:00'), - Timestamp('2012-04-06 00:00:00'), - Timestamp('2013-03-29 00:00:00'), - Timestamp('2014-04-18 00:00:00'), - Timestamp('2015-04-03 00:00:00'), - Timestamp('2016-03-25 00:00:00'), - Timestamp('2017-04-14 00:00:00'), - Timestamp('2018-03-30 00:00:00'), - Timestamp('2019-04-19 00:00:00'), - Timestamp('2020-04-10 00:00:00')] - self.assertEqual(list(holidays), holidayList) - + + self.check_results( + EasterMonday, + start=self.start_date, + end=self.end_date, + expected=[ + Timestamp('2011-04-25 00:00:00'), + Timestamp('2012-04-09 00:00:00'), + Timestamp('2013-04-01 00:00:00'), + Timestamp('2014-04-21 00:00:00'), + Timestamp('2015-04-06 00:00:00'), + Timestamp('2016-03-28 00:00:00'), + Timestamp('2017-04-17 00:00:00'), + Timestamp('2018-04-02 00:00:00'), + Timestamp('2019-04-22 00:00:00'), + Timestamp('2020-04-13 00:00:00'), + ], + ) + self.check_results( + GoodFriday, + start=self.start_date, + end=self.end_date, + expected=[ + Timestamp('2011-04-22 00:00:00'), + Timestamp('2012-04-06 00:00:00'), + Timestamp('2013-03-29 00:00:00'), + Timestamp('2014-04-18 00:00:00'), + Timestamp('2015-04-03 00:00:00'), + Timestamp('2016-03-25 00:00:00'), + Timestamp('2017-04-14 00:00:00'), + Timestamp('2018-03-30 00:00:00'), + Timestamp('2019-04-19 00:00:00'), + Timestamp('2020-04-10 00:00:00'), + ], + ) def test_usthanksgivingday(self): - holidays = USThanksgivingDay.dates(self.start_date, - self.end_date) - holidayList = [ - datetime(2011, 11, 24), - datetime(2012, 11, 22), - datetime(2013, 11, 28), - datetime(2014, 11, 27), - datetime(2015, 11, 26), - datetime(2016, 11, 24), - datetime(2017, 11, 23), - datetime(2018, 11, 22), - datetime(2019, 11, 28), - datetime(2020, 11, 26), - ] - - self.assertEqual(list(holidays), holidayList) + + self.check_results( + USThanksgivingDay, + start=self.start_date, + end=self.end_date, + expected=[ + datetime(2011, 11, 24), + datetime(2012, 11, 22), + datetime(2013, 11, 28), + datetime(2014, 11, 27), + datetime(2015, 11, 26), + datetime(2016, 11, 24), + datetime(2017, 11, 23), + datetime(2018, 11, 22), + datetime(2019, 11, 28), + datetime(2020, 11, 26), + ], + ) def test_argument_types(self): holidays = USThanksgivingDay.dates(self.start_date, diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 0793508b4912c..a051560617604 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -10,7 +10,7 @@ import numpy as np from pandas.core.datetools import ( - bday, BDay, CDay, BQuarterEnd, BMonthEnd, + bday, BDay, CDay, BQuarterEnd, BMonthEnd, BusinessHour, CBMonthEnd, CBMonthBegin, BYearEnd, MonthEnd, MonthBegin, BYearBegin, CustomBusinessDay, QuarterBegin, BQuarterBegin, BMonthBegin, DateOffset, Week, @@ -23,7 +23,6 @@ from 
pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache, date_range from pandas.tseries.tools import parse_time_string, DateParseError import pandas.tseries.offsets as offsets - from pandas.io.pickle import read_pickle from pandas.tslib import NaT, Timestamp, Timedelta import pandas.tslib as tslib @@ -133,7 +132,11 @@ def test_apply_out_of_range(self): # try to create an out-of-bounds result timestamp; if we can't create the offset # skip try: - offset = self._get_offset(self._offset, value=10000) + if self._offset is BusinessHour: + # Using 10000 in BusinessHour fails in tz check because of DST difference + offset = self._get_offset(self._offset, value=100000) + else: + offset = self._get_offset(self._offset, value=10000) result = Timestamp('20080101') + offset self.assertIsInstance(result, datetime) @@ -179,6 +182,7 @@ def setUp(self): 'BQuarterBegin': Timestamp('2011-03-01 09:00:00'), 'QuarterEnd': Timestamp('2011-03-31 09:00:00'), 'BQuarterEnd': Timestamp('2011-03-31 09:00:00'), + 'BusinessHour': Timestamp('2011-01-03 10:00:00'), 'WeekOfMonth': Timestamp('2011-01-08 09:00:00'), 'LastWeekOfMonth': Timestamp('2011-01-29 09:00:00'), 'FY5253Quarter': Timestamp('2011-01-25 09:00:00'), @@ -278,6 +282,8 @@ def test_rollforward(self): for n in no_changes: expecteds[n] = Timestamp('2011/01/01 09:00') + expecteds['BusinessHour'] = Timestamp('2011-01-03 09:00:00') + # but be changed when normalize=True norm_expected = expecteds.copy() for k in norm_expected: @@ -321,6 +327,7 @@ def test_rollback(self): 'BQuarterBegin': Timestamp('2010-12-01 09:00:00'), 'QuarterEnd': Timestamp('2010-12-31 09:00:00'), 'BQuarterEnd': Timestamp('2010-12-31 09:00:00'), + 'BusinessHour': Timestamp('2010-12-31 17:00:00'), 'WeekOfMonth': Timestamp('2010-12-11 09:00:00'), 'LastWeekOfMonth': Timestamp('2010-12-25 09:00:00'), 'FY5253Quarter': Timestamp('2010-10-26 09:00:00'), @@ -371,6 +378,10 @@ def test_onOffset(self): offset_n = self._get_offset(offset, normalize=True) self.assertFalse(offset_n.onOffset(dt)) + if offset is BusinessHour: + # In default BusinessHour (9:00-17:00), normalized time + # cannot be in business hour range + continue date = datetime(dt.year, dt.month, dt.day) self.assertTrue(offset_n.onOffset(date)) @@ -642,6 +653,593 @@ def test_offsets_compare_equal(self): self.assertFalse(offset1 != offset2) +class TestBusinessHour(Base): + _multiprocess_can_split_ = True + _offset = BusinessHour + + def setUp(self): + self.d = datetime(2014, 7, 1, 10, 00) + + self.offset1 = BusinessHour() + self.offset2 = BusinessHour(n=3) + + self.offset3 = BusinessHour(n=-1) + self.offset4 = BusinessHour(n=-4) + + from datetime import time as dt_time + self.offset5 = BusinessHour(start=dt_time(11, 0), end=dt_time(14, 30)) + self.offset6 = BusinessHour(start='20:00', end='05:00') + self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30), end=dt_time(6, 30)) + + def test_constructor_errors(self): + from datetime import time as dt_time + with tm.assertRaises(ValueError): + BusinessHour(start=dt_time(11, 0, 5)) + with tm.assertRaises(ValueError): + BusinessHour(start='AAA') + with tm.assertRaises(ValueError): + BusinessHour(start='14:00:05') + + def test_different_normalize_equals(self): + # equivalent in this special case + offset = self._offset() + offset2 = self._offset() + offset2.normalize = True + self.assertEqual(offset, offset2) + + def test_repr(self): + self.assertEqual(repr(self.offset1), '') + self.assertEqual(repr(self.offset2), '<3 * BusinessHours: BH=09:00-17:00>') + 
self.assertEqual(repr(self.offset3), '<-1 * BusinessHour: BH=09:00-17:00>') + self.assertEqual(repr(self.offset4), '<-4 * BusinessHours: BH=09:00-17:00>') + + self.assertEqual(repr(self.offset5), '') + self.assertEqual(repr(self.offset6), '') + self.assertEqual(repr(self.offset7), '<-2 * BusinessHours: BH=21:30-06:30>') + + def test_with_offset(self): + expected = Timestamp('2014-07-01 13:00') + + self.assertEqual(self.d + BusinessHour() * 3, expected) + self.assertEqual(self.d + BusinessHour(n=3), expected) + + def testEQ(self): + for offset in [self.offset1, self.offset2, self.offset3, self.offset4]: + self.assertEqual(offset, offset) + + self.assertNotEqual(BusinessHour(), BusinessHour(-1)) + self.assertEqual(BusinessHour(start='09:00'), BusinessHour()) + self.assertNotEqual(BusinessHour(start='09:00'), BusinessHour(start='09:01')) + self.assertNotEqual(BusinessHour(start='09:00', end='17:00'), + BusinessHour(start='17:00', end='09:01')) + + def test_hash(self): + self.assertEqual(hash(self.offset2), hash(self.offset2)) + + def testCall(self): + self.assertEqual(self.offset1(self.d), datetime(2014, 7, 1, 11)) + self.assertEqual(self.offset2(self.d), datetime(2014, 7, 1, 13)) + self.assertEqual(self.offset3(self.d), datetime(2014, 6, 30, 17)) + self.assertEqual(self.offset4(self.d), datetime(2014, 6, 30, 14)) + + def testRAdd(self): + self.assertEqual(self.d + self.offset2, self.offset2 + self.d) + + def testSub(self): + off = self.offset2 + self.assertRaises(Exception, off.__sub__, self.d) + self.assertEqual(2 * off - off, off) + + self.assertEqual(self.d - self.offset2, self.d + self._offset(-3)) + + def testRSub(self): + self.assertEqual(self.d - self.offset2, (-self.offset2).apply(self.d)) + + def testMult1(self): + self.assertEqual(self.d + 5 * self.offset1, self.d + self._offset(5)) + + def testMult2(self): + self.assertEqual(self.d + (-3 * self._offset(-2)), + self.d + self._offset(6)) + + def testRollback1(self): + self.assertEqual(self.offset1.rollback(self.d), self.d) + self.assertEqual(self.offset2.rollback(self.d), self.d) + self.assertEqual(self.offset3.rollback(self.d), self.d) + self.assertEqual(self.offset4.rollback(self.d), self.d) + self.assertEqual(self.offset5.rollback(self.d), datetime(2014, 6, 30, 14, 30)) + self.assertEqual(self.offset6.rollback(self.d), datetime(2014, 7, 1, 5, 0)) + self.assertEqual(self.offset7.rollback(self.d), datetime(2014, 7, 1, 6, 30)) + + d = datetime(2014, 7, 1, 0) + self.assertEqual(self.offset1.rollback(d), datetime(2014, 6, 30, 17)) + self.assertEqual(self.offset2.rollback(d), datetime(2014, 6, 30, 17)) + self.assertEqual(self.offset3.rollback(d), datetime(2014, 6, 30, 17)) + self.assertEqual(self.offset4.rollback(d), datetime(2014, 6, 30, 17)) + self.assertEqual(self.offset5.rollback(d), datetime(2014, 6, 30, 14, 30)) + self.assertEqual(self.offset6.rollback(d), d) + self.assertEqual(self.offset7.rollback(d), d) + + self.assertEqual(self._offset(5).rollback(self.d), self.d) + + def testRollback2(self): + self.assertEqual(self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)), + datetime(2014, 7, 4, 17, 0)) + + def testRollforward1(self): + self.assertEqual(self.offset1.rollforward(self.d), self.d) + self.assertEqual(self.offset2.rollforward(self.d), self.d) + self.assertEqual(self.offset3.rollforward(self.d), self.d) + self.assertEqual(self.offset4.rollforward(self.d), self.d) + self.assertEqual(self.offset5.rollforward(self.d), datetime(2014, 7, 1, 11, 0)) + self.assertEqual(self.offset6.rollforward(self.d), datetime(2014, 7, 1, 20, 
0)) + self.assertEqual(self.offset7.rollforward(self.d), datetime(2014, 7, 1, 21, 30)) + + d = datetime(2014, 7, 1, 0) + self.assertEqual(self.offset1.rollforward(d), datetime(2014, 7, 1, 9)) + self.assertEqual(self.offset2.rollforward(d), datetime(2014, 7, 1, 9)) + self.assertEqual(self.offset3.rollforward(d), datetime(2014, 7, 1, 9)) + self.assertEqual(self.offset4.rollforward(d), datetime(2014, 7, 1, 9)) + self.assertEqual(self.offset5.rollforward(d), datetime(2014, 7, 1, 11)) + self.assertEqual(self.offset6.rollforward(d), d) + self.assertEqual(self.offset7.rollforward(d), d) + + self.assertEqual(self._offset(5).rollforward(self.d), self.d) + + def testRollforward2(self): + self.assertEqual(self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)), + datetime(2014, 7, 7, 9)) + + def test_roll_date_object(self): + offset = BusinessHour() + + dt = datetime(2014, 7, 6, 15, 0) + + result = offset.rollback(dt) + self.assertEqual(result, datetime(2014, 7, 4, 17)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2014, 7, 7, 9)) + + def test_normalize(self): + tests = [] + + tests.append((BusinessHour(normalize=True), + {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) + + tests.append((BusinessHour(-1, normalize=True), + {datetime(2014, 7, 1, 8): datetime(2014, 6, 30), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30), + datetime(2014, 7, 1, 0): datetime(2014, 6, 30), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) + + tests.append((BusinessHour(1, normalize=True, start='17:00', end='04:00'), + {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 2), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) + + for offset, cases in tests: + for dt, expected in compat.iteritems(cases): + self.assertEqual(offset.apply(dt), expected) + + def test_onOffset(self): + tests = [] + + tests.append((BusinessHour(), + {datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False})) + + tests.append((BusinessHour(start='10:00', end='15:00'), + {datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False})) + + tests.append((BusinessHour(start='19:00', end='05:00'), + {datetime(2014, 7, 1, 9, 0): 
False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False})) + + for offset, cases in tests: + for dt, expected in compat.iteritems(cases): + self.assertEqual(offset.onOffset(dt), expected) + + def test_opening_time(self): + tests = [] + + # opening time should be affected by sign of n, not by n's value and end + tests.append(([BusinessHour(), BusinessHour(n=2), BusinessHour(n=4), + BusinessHour(end='10:00'), BusinessHour(n=2, end='4:00'), + BusinessHour(n=4, end='15:00')], + {datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9), datetime(2014, 7, 1, 9)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), datetime(2014, 7, 1, 9)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), datetime(2014, 7, 1, 9)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), datetime(2014, 7, 1, 9)), + # if timestamp is on opening time, next opening time is as it is + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), datetime(2014, 7, 2, 9)), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), datetime(2014, 7, 4, 9)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), datetime(2014, 7, 4, 9)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 9), datetime(2014, 7, 4, 9)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), datetime(2014, 7, 4, 9)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), datetime(2014, 7, 4, 9)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), datetime(2014, 7, 7, 9))})) + + tests.append(([BusinessHour(start='11:15'), BusinessHour(n=2, start='11:15'), + BusinessHour(n=3, start='11:15'), + BusinessHour(start='11:15', end='10:00'), + BusinessHour(n=2, start='11:15', end='4:00'), + BusinessHour(n=3, start='11:15', end='15:00')], + {datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), datetime(2014, 6, 30, 11, 15)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 3, 11, 15), datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), datetime(2014, 7, 3, 11, 15)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15))})) + + tests.append(([BusinessHour(-1), BusinessHour(n=-2), BusinessHour(n=-4), + BusinessHour(n=-1, 
end='10:00'), BusinessHour(n=-2, end='4:00'), + BusinessHour(n=-4, end='15:00')], + {datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9), datetime(2014, 7, 2, 9)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), datetime(2014, 7, 2, 9)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), datetime(2014, 7, 3, 9)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), datetime(2014, 7, 7, 9)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), datetime(2014, 7, 7, 9)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), datetime(2014, 7, 7, 9)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9), datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), datetime(2014, 7, 8, 9))})) + + tests.append(([BusinessHour(start='17:00', end='05:00'), + BusinessHour(n=3, start='17:00', end='03:00')], + {datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), datetime(2014, 6, 30, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), datetime(2014, 7, 1, 17)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 17), datetime(2014, 7, 1, 17)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), datetime(2014, 7, 1, 17)), + datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), datetime(2014, 7, 4, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), datetime(2014, 7, 3, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), datetime(2014, 7, 4, 17)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), datetime(2014, 7, 4, 17)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), datetime(2014, 7, 4, 17)), + datetime(2014, 7, 7, 17, 1): (datetime(2014, 7, 8, 17), datetime(2014, 7, 7, 17)),})) + + tests.append(([BusinessHour(-1, start='17:00', end='05:00'), + BusinessHour(n=-2, start='17:00', end='03:00')], + {datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), datetime(2014, 7, 2, 17)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 1, 17), datetime(2014, 7, 2, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), datetime(2014, 7, 7, 17)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), datetime(2014, 7, 7, 17)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), datetime(2014, 7, 7, 17)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), datetime(2014, 7, 7, 17)), + datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), datetime(2014, 7, 8, 17))})) + + for offsets, cases in tests: + for offset in offsets: + for dt, (exp_next, exp_prev) in compat.iteritems(cases): + self.assertEqual(offset._next_opening_time(dt), exp_next) + self.assertEqual(offset._prev_opening_time(dt), 
exp_prev) + + def test_apply(self): + tests = [] + + tests.append((BusinessHour(), + {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) + + tests.append((BusinessHour(4), + {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) + + tests.append((BusinessHour(-1), + {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30)})) + + tests.append((BusinessHour(-4), + {datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 5, 15): 
datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30)})) + + tests.append((BusinessHour(start='13:00', end='16:00'), + {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) + + tests.append((BusinessHour(n=2, start='13:00', end='16:00'), + {datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), + datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30)})) + + tests.append((BusinessHour(n=-1, start='13:00', end='16:00'), + {datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), + datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) + + tests.append((BusinessHour(n=-3, start='10:00', end='16:00'), + {datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), + datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30)})) + + tests.append((BusinessHour(start='19:00', end='05:00'), + {datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), + datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), + datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), + datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), + datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30)})) + + tests.append((BusinessHour(n=-1, start='19:00', end='05:00'), + {datetime(2014, 7, 
1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_large_n(self): + tests = [] + + tests.append((BusinessHour(40), # A week later + {datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), + datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), + datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), + datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), + datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), + datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), + datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), + datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30)})) + + tests.append((BusinessHour(-25), # 3 days and 1 hour before + {datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + + tests.append((BusinessHour(28, start='21:00', end='02:00'), # 5 days and 3 hours later + {datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), + datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), + datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_nanoseconds(self): + tests = [] + + tests.append((BusinessHour(), + 
{Timestamp('2014-07-04 15:00') + Nano(5): Timestamp('2014-07-04 16:00') + Nano(5), + Timestamp('2014-07-04 16:00') + Nano(5): Timestamp('2014-07-07 09:00') + Nano(5), + Timestamp('2014-07-04 16:00') - Nano(5): Timestamp('2014-07-04 17:00') - Nano(5) + })) + + tests.append((BusinessHour(-1), + {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp('2014-07-04 14:00') + Nano(5), + Timestamp('2014-07-04 10:00') + Nano(5): Timestamp('2014-07-04 09:00') + Nano(5), + Timestamp('2014-07-04 10:00') - Nano(5): Timestamp('2014-07-03 17:00') - Nano(5), + })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = self._offset() + offset2 = self._offset() + self.assertFalse(offset1 != offset2) + + def test_datetimeindex(self): + idx1 = DatetimeIndex(start='2014-07-04 15:00', end='2014-07-08 10:00', freq='BH') + idx2 = DatetimeIndex(start='2014-07-04 15:00', periods=12, freq='BH') + idx3 = DatetimeIndex(end='2014-07-08 10:00', periods=12, freq='BH') + expected = DatetimeIndex(['2014-07-04 15:00', '2014-07-04 16:00', '2014-07-07 09:00', + '2014-07-07 10:00', '2014-07-07 11:00', '2014-07-07 12:00', + '2014-07-07 13:00', '2014-07-07 14:00', '2014-07-07 15:00', + '2014-07-07 16:00', '2014-07-08 09:00', '2014-07-08 10:00'], + freq='BH') + for idx in [idx1, idx2, idx3]: + tm.assert_index_equal(idx, expected) + + idx1 = DatetimeIndex(start='2014-07-04 15:45', end='2014-07-08 10:45', freq='BH') + idx2 = DatetimeIndex(start='2014-07-04 15:45', periods=12, freq='BH') + idx3 = DatetimeIndex(end='2014-07-08 10:45', periods=12, freq='BH') + + expected = DatetimeIndex(['2014-07-04 15:45', '2014-07-04 16:45', '2014-07-07 09:45', + '2014-07-07 10:45', '2014-07-07 11:45', '2014-07-07 12:45', + '2014-07-07 13:45', '2014-07-07 14:45', '2014-07-07 15:45', + '2014-07-07 16:45', '2014-07-08 09:45', '2014-07-08 10:45'], + freq='BH') + expected = idx1 + for idx in [idx1, idx2, idx3]: + tm.assert_index_equal(idx, expected) + + class TestCustomBusinessDay(Base): _multiprocess_can_split_ = True _offset = CDay diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 17edcd7504102..0218af63ca7d6 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -101,15 +101,15 @@ def test_timestamp_tz_arg(self): pytz.timezone('Europe/Brussels').normalize(p).tzinfo) def test_timestamp_tz_arg_dateutil(self): - import dateutil + from pandas.tslib import _dateutil_gettz as gettz from pandas.tslib import maybe_get_tz p = Period('1/1/2005', freq='M').to_timestamp(tz=maybe_get_tz('dateutil/Europe/Brussels')) - self.assertEqual(p.tz, dateutil.zoneinfo.gettz('Europe/Brussels')) + self.assertEqual(p.tz, gettz('Europe/Brussels')) def test_timestamp_tz_arg_dateutil_from_string(self): - import dateutil + from pandas.tslib import _dateutil_gettz as gettz p = Period('1/1/2005', freq='M').to_timestamp(tz='dateutil/Europe/Brussels') - self.assertEqual(p.tz, dateutil.zoneinfo.gettz('Europe/Brussels')) + self.assertEqual(p.tz, gettz('Europe/Brussels')) def test_timestamp_nat_tz(self): t = Period('NaT', freq='M').to_timestamp() @@ -226,16 +226,29 @@ def test_period_constructor(self): i1 = Period(date(2007, 1, 1), freq='M') i2 = Period(datetime(2007, 1, 1), freq='M') + i3 = Period(np.datetime64('2007-01-01'), freq='M') + i4 = Period(np.datetime64('2007-01-01 00:00:00Z'), freq='M') + i5 = Period(np.datetime64('2007-01-01 00:00:00.000Z'), freq='M') 
self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + self.assertEqual(i1, i4) + self.assertEqual(i1, i5) i1 = Period('2007-01-01 09:00:00.001') expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') self.assertEqual(i1, expected) + expected = Period(np.datetime64('2007-01-01 09:00:00.001Z'), freq='L') + self.assertEqual(i1, expected) + i1 = Period('2007-01-01 09:00:00.00101') expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') self.assertEqual(i1, expected) + expected = Period(np.datetime64('2007-01-01 09:00:00.00101Z'), + freq='U') + self.assertEqual(i1, expected) + self.assertRaises(ValueError, Period, ordinal=200701) self.assertRaises(ValueError, Period, '2007-1-1', freq='X') @@ -434,7 +447,7 @@ def test_properties_weekly(self): assert_equal((w_date - 1).week, 52) assert_equal(w_date.days_in_month, 31) assert_equal(Period(freq='WK', year=2012, month=2, day=1).days_in_month, 29) - + def test_properties_daily(self): # Test properties on Periods with daily frequency. b_date = Period(freq='B', year=2007, month=1, day=1) @@ -2105,6 +2118,7 @@ def test_range_slice_outofbounds(self): for idx in [didx, pidx]: df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) empty = DataFrame(index=idx.__class__([], freq='D'), columns=['units']) + empty['units'] = empty['units'].astype('int64') tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2]) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index c4e642ffe43b0..c5ed8a1ac3e31 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -528,7 +528,9 @@ def test_secondary_y(self): ser = Series(np.random.randn(10)) ser2 = Series(np.random.randn(10)) - ax = ser.plot(secondary_y=True).right_ax + ax = ser.plot(secondary_y=True) + self.assertTrue(hasattr(ax, 'left_ax')) + self.assertFalse(hasattr(ax, 'right_ax')) fig = ax.get_figure() axes = fig.get_axes() l = ax.get_lines()[0] @@ -543,8 +545,12 @@ def test_secondary_y(self): plt.close(ax2.get_figure()) ax = ser2.plot() - ax2 = ser.plot(secondary_y=True).right_ax + ax2 = ser.plot(secondary_y=True) self.assertTrue(ax.get_yaxis().get_visible()) + self.assertFalse(hasattr(ax, 'left_ax')) + self.assertTrue(hasattr(ax, 'right_ax')) + self.assertTrue(hasattr(ax2, 'left_ax')) + self.assertFalse(hasattr(ax2, 'right_ax')) @slow def test_secondary_y_ts(self): @@ -552,7 +558,9 @@ def test_secondary_y_ts(self): idx = date_range('1/1/2000', periods=10) ser = Series(np.random.randn(10), idx) ser2 = Series(np.random.randn(10), idx) - ax = ser.plot(secondary_y=True).right_ax + ax = ser.plot(secondary_y=True) + self.assertTrue(hasattr(ax, 'left_ax')) + self.assertFalse(hasattr(ax, 'right_ax')) fig = ax.get_figure() axes = fig.get_axes() l = ax.get_lines()[0] @@ -577,7 +585,9 @@ def test_secondary_kde(self): import matplotlib.pyplot as plt ser = Series(np.random.randn(10)) - ax = ser.plot(secondary_y=True, kind='density').right_ax + ax = ser.plot(secondary_y=True, kind='density') + self.assertTrue(hasattr(ax, 'left_ax')) + self.assertFalse(hasattr(ax, 'right_ax')) fig = ax.get_figure() axes = fig.get_axes() self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'right') @@ -636,6 +646,38 @@ def test_mixed_freq_irregular_first(self): x2 = lines[1].get_xdata() assert_array_equal(x2, s1.index.asobject.values) + def test_mixed_freq_regular_first_df(self): + # GH 9852 + import matplotlib.pyplot as plt + s1 = tm.makeTimeSeries().to_frame() + 
s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] + ax = s1.plot() + ax2 = s2.plot(style='g', ax=ax) + lines = ax2.get_lines() + idx1 = PeriodIndex(lines[0].get_xdata()) + idx2 = PeriodIndex(lines[1].get_xdata()) + self.assertTrue(idx1.equals(s1.index.to_period('B'))) + self.assertTrue(idx2.equals(s2.index.to_period('B'))) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + self.assertEqual(left, pidx[0].ordinal) + self.assertEqual(right, pidx[-1].ordinal) + + @slow + def test_mixed_freq_irregular_first_df(self): + # GH 9852 + import matplotlib.pyplot as plt + s1 = tm.makeTimeSeries().to_frame() + s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] + ax = s2.plot(style='g') + ax = s1.plot(ax=ax) + self.assertFalse(hasattr(ax, 'freq')) + lines = ax.get_lines() + x1 = lines[0].get_xdata() + assert_array_equal(x1, s2.index.asobject.values) + x2 = lines[1].get_xdata() + assert_array_equal(x2, s1.index.asobject.values) + def test_mixed_freq_hf_first(self): idxh = date_range('1/1/1999', periods=365, freq='D') idxl = date_range('1/1/1999', periods=12, freq='M') @@ -890,7 +932,9 @@ def test_secondary_upsample(self): ax = high.plot(secondary_y=True) for l in ax.get_lines(): self.assertEqual(PeriodIndex(l.get_xdata()).freq, 'D') - for l in ax.right_ax.get_lines(): + self.assertTrue(hasattr(ax, 'left_ax')) + self.assertFalse(hasattr(ax, 'right_ax')) + for l in ax.left_ax.get_lines(): self.assertEqual(PeriodIndex(l.get_xdata()).freq, 'D') @slow diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index c338bbeae79c7..d7b1256329cc3 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -82,15 +82,17 @@ def test_resample_basic(self): name='index') s = Series(np.random.randn(14), index=rng) result = s.resample('5min', how='mean', closed='right', label='right') + + exp_idx = date_range('1/1/2000', periods=4, freq='5min', name='index') expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=date_range('1/1/2000', periods=4, freq='5min')) + index=exp_idx) assert_series_equal(result, expected) self.assertEqual(result.index.name, 'index') result = s.resample('5min', how='mean', closed='left', label='right') - expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()], - index=date_range('1/1/2000 00:05', periods=3, - freq='5min')) + + exp_idx = date_range('1/1/2000 00:05', periods=3, freq='5min', name='index') + expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()], index=exp_idx) assert_series_equal(result, expected) s = self.series @@ -115,7 +117,7 @@ def _ohlc(group): if isnull(group).all(): return np.repeat(np.nan, 4) return [group[0], group.max(), group.min(), group[-1]] - inds = date_range('1/1/2000', periods=4, freq='5min') + inds = date_range('1/1/2000', periods=4, freq='5min', name='index') for arg in args: if arg == 'ohlc': @@ -376,6 +378,16 @@ def test_resample_upsample(self): self.assertEqual(result.index.name, 'index') + def test_resample_extra_index_point(self): + # GH 9756 + index = DatetimeIndex(start='20150101', end='20150331', freq='BM') + expected = DataFrame({'A' : Series([21,41,63], index=index)}) + + index = DatetimeIndex(start='20150101', end='20150331', freq='B') + df = DataFrame({'A' : Series(range(len(index)),index=index)},dtype='int64') + result = df.resample('BM', how='last') + assert_frame_equal(result, expected) + def test_upsample_with_limit(self): rng = date_range('1/1/2000', periods=3, freq='5t') ts = Series(np.random.randn(len(rng)), rng) @@ -875,23 
+887,23 @@ def test_resmaple_dst_anchor(self): # 5172 dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') df = DataFrame([5], index=dti) - assert_frame_equal(df.resample(rule='D', how='sum'), + assert_frame_equal(df.resample(rule='D', how='sum'), DataFrame([5], index=df.index.normalize())) df.resample(rule='MS', how='sum') assert_frame_equal(df.resample(rule='MS', how='sum'), - DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], + DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], tz='US/Eastern'))) dti = date_range('2013-09-30', '2013-11-02', freq='30Min', tz='Europe/Paris') values = range(dti.size) df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype='int64') how = {"a": "min", "b": "max", "c": "count"} - + assert_frame_equal(df.resample("W-MON", how=how)[["a", "b", "c"]], DataFrame({"a": [0, 48, 384, 720, 1056, 1394], "b": [47, 383, 719, 1055, 1393, 1586], "c": [48, 336, 336, 336, 338, 193]}, - index=date_range('9/30/2013', '11/4/2013', + index=date_range('9/30/2013', '11/4/2013', freq='W-MON', tz='Europe/Paris')), 'W-MON Frequency') @@ -899,7 +911,7 @@ def test_resmaple_dst_anchor(self): DataFrame({"a": [0, 48, 720, 1394], "b": [47, 719, 1393, 1586], "c": [48, 672, 674, 193]}, - index=date_range('9/30/2013', '11/11/2013', + index=date_range('9/30/2013', '11/11/2013', freq='2W-MON', tz='Europe/Paris')), '2W-MON Frequency') @@ -907,7 +919,7 @@ def test_resmaple_dst_anchor(self): DataFrame({"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, - index=date_range('9/1/2013', '11/1/2013', + index=date_range('9/1/2013', '11/1/2013', freq='MS', tz='Europe/Paris')), 'MS Frequency') @@ -915,7 +927,7 @@ def test_resmaple_dst_anchor(self): DataFrame({"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, - index=date_range('9/1/2013', '11/1/2013', + index=date_range('9/1/2013', '11/1/2013', freq='2MS', tz='Europe/Paris')), '2MS Frequency') @@ -1553,6 +1565,8 @@ def test_aggregate_with_nat(self): expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') dt_result = getattr(dt_grouped, func)() assert_series_equal(expected, dt_result) + # GH 9925 + self.assertEqual(dt_result.index.name, 'key') # if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth' doesn't work yet diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index b74a3a59d3bca..948a0be91b276 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -23,6 +23,8 @@ import pandas.util.testing as tm from numpy.random import rand, randn from pandas import _np_version_under1p8 +import pandas.compat as compat + iNaT = tslib.iNaT @@ -64,6 +66,13 @@ def test_construction(self): self.assertEqual(Timedelta(123072001000000).value, 123072001000000) self.assertTrue('1 days 10:11:12.001' in str(Timedelta(123072001000000))) + # string conversion with/without leading zero + # GH 9570 + self.assertEqual(Timedelta('0:00:00'), timedelta(hours=0)) + self.assertEqual(Timedelta('00:00:00'), timedelta(hours=0)) + self.assertEqual(Timedelta('-1:00:00'), -timedelta(hours=1)) + self.assertEqual(Timedelta('-01:00:00'), -timedelta(hours=1)) + # more strings # GH 8190 self.assertEqual(Timedelta('1 h'), timedelta(hours=1)) @@ -302,51 +311,70 @@ class Other: def test_fields(self): + def check(value): + # that we are int/long like + self.assertTrue(isinstance(value, (int, compat.long))) + # compat to datetime.timedelta rng = to_timedelta('1 days, 10:11:12') - self.assertEqual(rng.days,1) - 
self.assertEqual(rng.seconds,10*3600+11*60+12) - self.assertEqual(rng.microseconds,0) - self.assertEqual(rng.nanoseconds,0) + self.assertEqual(rng.days, 1) + self.assertEqual(rng.seconds, 10*3600+11*60+12) + self.assertEqual(rng.microseconds, 0) + self.assertEqual(rng.nanoseconds, 0) self.assertRaises(AttributeError, lambda : rng.hours) self.assertRaises(AttributeError, lambda : rng.minutes) self.assertRaises(AttributeError, lambda : rng.milliseconds) + # GH 10050 + check(rng.days) + check(rng.seconds) + check(rng.microseconds) + check(rng.nanoseconds) + td = Timedelta('-1 days, 10:11:12') - self.assertEqual(abs(td),Timedelta('13:48:48')) + self.assertEqual(abs(td), Timedelta('13:48:48')) self.assertTrue(str(td) == "-1 days +10:11:12") - self.assertEqual(-td,Timedelta('0 days 13:48:48')) - self.assertEqual(-Timedelta('-1 days, 10:11:12').value,49728000000000) - self.assertEqual(Timedelta('-1 days, 10:11:12').value,-49728000000000) + self.assertEqual(-td, Timedelta('0 days 13:48:48')) + self.assertEqual(-Timedelta('-1 days, 10:11:12').value, 49728000000000) + self.assertEqual(Timedelta('-1 days, 10:11:12').value, -49728000000000) rng = to_timedelta('-1 days, 10:11:12.100123456') - self.assertEqual(rng.days,-1) - self.assertEqual(rng.seconds,10*3600+11*60+12) - self.assertEqual(rng.microseconds,100*1000+123) - self.assertEqual(rng.nanoseconds,456) + self.assertEqual(rng.days, -1) + self.assertEqual(rng.seconds, 10*3600+11*60+12) + self.assertEqual(rng.microseconds, 100*1000+123) + self.assertEqual(rng.nanoseconds, 456) self.assertRaises(AttributeError, lambda : rng.hours) self.assertRaises(AttributeError, lambda : rng.minutes) self.assertRaises(AttributeError, lambda : rng.milliseconds) # components tup = pd.to_timedelta(-1, 'us').components - self.assertEqual(tup.days,-1) - self.assertEqual(tup.hours,23) - self.assertEqual(tup.minutes,59) - self.assertEqual(tup.seconds,59) - self.assertEqual(tup.milliseconds,999) - self.assertEqual(tup.microseconds,999) - self.assertEqual(tup.nanoseconds,0) + self.assertEqual(tup.days, -1) + self.assertEqual(tup.hours, 23) + self.assertEqual(tup.minutes, 59) + self.assertEqual(tup.seconds, 59) + self.assertEqual(tup.milliseconds, 999) + self.assertEqual(tup.microseconds, 999) + self.assertEqual(tup.nanoseconds, 0) + + # GH 10050 + check(tup.days) + check(tup.hours) + check(tup.minutes) + check(tup.seconds) + check(tup.milliseconds) + check(tup.microseconds) + check(tup.nanoseconds) tup = Timedelta('-1 days 1 us').components - self.assertEqual(tup.days,-2) - self.assertEqual(tup.hours,23) - self.assertEqual(tup.minutes,59) - self.assertEqual(tup.seconds,59) - self.assertEqual(tup.milliseconds,999) - self.assertEqual(tup.microseconds,999) - self.assertEqual(tup.nanoseconds,0) + self.assertEqual(tup.days, -2) + self.assertEqual(tup.hours, 23) + self.assertEqual(tup.minutes, 59) + self.assertEqual(tup.seconds, 59) + self.assertEqual(tup.milliseconds, 999) + self.assertEqual(tup.microseconds, 999) + self.assertEqual(tup.nanoseconds, 0) def test_timedelta_range(self): @@ -607,7 +635,7 @@ def test_timedelta_ops(self): self.assertEqual(result, expected) result = td.median() - expected = to_timedelta('00:00:08') + expected = to_timedelta('00:00:09') self.assertEqual(result, expected) result = td.to_frame().median() @@ -634,6 +662,14 @@ def test_timedelta_ops(self): for op in ['skew','kurt','sem','var','prod']: self.assertRaises(TypeError, lambda : getattr(td,op)()) + # GH 10040 + # make sure NaT is properly handled by median() + s = Series([Timestamp('2015-02-03'), 
Timestamp('2015-02-07')]) + self.assertEqual(s.diff().median(), timedelta(days=4)) + + s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), Timestamp('2015-02-15')]) + self.assertEqual(s.diff().median(), timedelta(days=6)) + def test_timedelta_ops_scalar(self): # GH 6808 base = pd.to_datetime('20130101 09:01:12.123456') @@ -942,6 +978,10 @@ def test_constructor_name(self): name='TEST') self.assertEqual(idx.name, 'TEST') + # GH10025 + idx2 = TimedeltaIndex(idx, name='something else') + self.assertEqual(idx2.name, 'something else') + def test_freq_conversion(self): # doc example diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 436a976c72e7e..8412ba8d4aad1 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -288,7 +288,7 @@ def test_indexing(self): self.assertRaises(KeyError, df.__getitem__, df.index[2],) def test_recreate_from_data(self): - freqs = ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'H', 'N', 'C'] + freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', 'C'] for f in freqs: org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) @@ -417,9 +417,9 @@ def test_timestamp_to_datetime_explicit_pytz(self): def test_timestamp_to_datetime_explicit_dateutil(self): _skip_if_windows_python_3() tm._skip_if_no_dateutil() - import dateutil + from pandas.tslib import _dateutil_gettz as gettz rng = date_range('20090415', '20090519', - tz=dateutil.zoneinfo.gettz('US/Eastern')) + tz=gettz('US/Eastern')) stamp = rng[0] dtval = stamp.to_pydatetime() @@ -791,7 +791,7 @@ def test_series_repr_nat(self): series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') result = repr(series) - expected = ('0 1970-01-01 00:00:00\n' + expected = ('0 1970-01-01 00:00:00.000000\n' '1 1970-01-01 00:00:00.000001\n' '2 1970-01-01 00:00:00.000002\n' '3 NaT\n' @@ -1131,6 +1131,15 @@ def test_reindex_with_datetimes(self): result = ts[list(ts.index[5:10])] tm.assert_series_equal(result, expected) + def test_asfreq_keep_index_name(self): + # GH #9854 + index_name = 'bar' + index = pd.date_range('20130101',periods=20,name=index_name) + df = pd.DataFrame([x for x in range(20)],columns=['foo'],index=index) + + tm.assert_equal(index_name, df.index.name) + tm.assert_equal(index_name, df.asfreq('10D').index.name) + def test_promote_datetime_date(self): rng = date_range('1/1/2000', periods=20) ts = Series(np.random.randn(20), index=rng) @@ -1798,7 +1807,7 @@ def test_append_concat_tz_explicit_pytz(self): def test_append_concat_tz_dateutil(self): # GH 2938 tm._skip_if_no_dateutil() - from dateutil.zoneinfo import gettz as timezone + from pandas.tslib import _dateutil_gettz as timezone rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz='dateutil/US/Eastern') @@ -3338,6 +3347,29 @@ def test_date_range_bms_bug(self): ex_first = Timestamp('2000-01-03') self.assertEqual(rng[0], ex_first) + def test_date_range_businesshour(self): + idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00', + '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00', + '2014-07-04 15:00', '2014-07-04 16:00'], freq='BH') + rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH') + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex(['2014-07-04 16:00', '2014-07-07 09:00'], freq='BH') + rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH') + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00', + '2014-07-04 
12:00', '2014-07-04 13:00', '2014-07-04 14:00', + '2014-07-04 15:00', '2014-07-04 16:00', + '2014-07-07 09:00', '2014-07-07 10:00', '2014-07-07 11:00', + '2014-07-07 12:00', '2014-07-07 13:00', '2014-07-07 14:00', + '2014-07-07 15:00', '2014-07-07 16:00', + '2014-07-08 09:00', '2014-07-08 10:00', '2014-07-08 11:00', + '2014-07-08 12:00', '2014-07-08 13:00', '2014-07-08 14:00', + '2014-07-08 15:00', '2014-07-08 16:00'], freq='BH') + rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH') + tm.assert_index_equal(idx, rng) + def test_string_index_series_name_converted(self): # #1644 df = DataFrame(np.random.randn(10, 4), diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index e452ddee9d8db..341450f504e2a 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -14,6 +14,8 @@ import pandas.tseries.offsets as offsets import pandas.util.testing as tm from pandas.util.testing import assert_series_equal +import pandas.compat as compat + class TestTimestamp(tm.TestCase): @@ -369,6 +371,50 @@ def test_today(self): self.assertTrue(abs(ts_from_string_tz.tz_localize(None) - ts_from_method_tz.tz_localize(None)) < delta) + def test_fields(self): + + def check(value, equal): + # that we are int/long like + self.assertTrue(isinstance(value, (int, compat.long))) + self.assertEqual(value, equal) + + # GH 10050 + ts = Timestamp('2015-05-10 09:06:03.000100001') + check(ts.year, 2015) + check(ts.month, 5) + check(ts.day, 10) + check(ts.hour, 9) + check(ts.minute, 6) + check(ts.second, 3) + self.assertRaises(AttributeError, lambda : ts.millisecond) + check(ts.microsecond, 100) + check(ts.nanosecond, 1) + check(ts.dayofweek, 6) + check(ts.quarter, 2) + check(ts.dayofyear, 130) + check(ts.week, 19) + check(ts.daysinmonth, 31) + check(ts.daysinmonth, 31) + + def test_nat_fields(self): + # GH 10050 + ts = Timestamp('NaT') + self.assertTrue(np.isnan(ts.year)) + self.assertTrue(np.isnan(ts.month)) + self.assertTrue(np.isnan(ts.day)) + self.assertTrue(np.isnan(ts.hour)) + self.assertTrue(np.isnan(ts.minute)) + self.assertTrue(np.isnan(ts.second)) + self.assertTrue(np.isnan(ts.microsecond)) + self.assertTrue(np.isnan(ts.nanosecond)) + self.assertTrue(np.isnan(ts.dayofweek)) + self.assertTrue(np.isnan(ts.quarter)) + self.assertTrue(np.isnan(ts.dayofyear)) + self.assertTrue(np.isnan(ts.week)) + self.assertTrue(np.isnan(ts.daysinmonth)) + self.assertTrue(np.isnan(ts.days_in_month)) + + class TestDatetimeParsingWrappers(tm.TestCase): def test_does_not_convert_mixed_integer(self): bad_date_strings = ( diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 91e75da1b551c..624981c5536f5 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -3,14 +3,12 @@ """ import re -from datetime import timedelta - import numpy as np import pandas.tslib as tslib from pandas import compat -from pandas.core.common import (ABCSeries, is_integer, is_integer_dtype, - is_timedelta64_dtype, _values_from_object, - is_list_like, isnull, _ensure_object) +from pandas.core.common import (ABCSeries, is_integer_dtype, + is_timedelta64_dtype, is_list_like, + isnull, _ensure_object) def to_timedelta(arg, unit='ns', box=True, coerce=False): """ @@ -119,7 +117,7 @@ def _validate_timedelta_unit(arg): _short_search = re.compile( "^\s*(?P-?)\s*(?P\d*\.?\d*)\s*(?Pd|s|ms|us|ns)?\s*$",re.IGNORECASE) _full_search = re.compile( - "^\s*(?P-?)\s*(?P\d*\.?\d*)?\s*(days|d|day)?,?\s*\+?(?P
elements. """ - raise NotImplementedError + raise AbstractMethodError(self) def _parse_thead(self, table): """Return the header of a table. @@ -300,7 +301,7 @@ def _parse_thead(self, table): thead : node-like A