diff --git a/.coveragerc b/.coveragerc index 5b264a626abfa..3f630aa6cf8f5 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,7 @@ # .coveragerc to control coverage.py [run] branch = False +omit = */tests/* [report] # Regexes for lines to exclude from consideration @@ -23,4 +24,4 @@ exclude_lines = ignore_errors = False [html] -directory = coverage_html_report \ No newline at end of file +directory = coverage_html_report diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..a1fbece3284ec --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,63 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, and in the interest of +fostering an open and welcoming community, we pledge to respect all people who +contribute through reporting issues, posting feature requests, updating +documentation, submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free +experience for everyone, regardless of level of experience, gender, gender +identity and expression, sexual orientation, disability, personal appearance, +body size, race, ethnicity, age, religion, or nationality. + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery +* Personal attacks +* Trolling or insulting/derogatory comments +* Public or private harassment +* Publishing other's private information, such as physical or electronic + addresses, without explicit permission +* Other unethical or unprofessional conduct + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +By adopting this Code of Conduct, project maintainers commit themselves to +fairly and consistently applying these principles to every aspect of managing +this project. Project maintainers who do not follow or enforce the Code of +Conduct may be permanently removed from the project team. + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. + +A working group of community members is committed to promptly addressing any +reported issues. The working group is made up of pandas contributors and users. +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the working group by e-mail (pandas-coc@googlegroups.com). +Messages sent to this e-mail address will not be publicly visible but only to +the working group members. The working group currently includes + +- Safia Abdalla +- Tom Augspurger +- Joris Van den Bossche +- Camille Scott +- Nathaniel Smith + +All complaints will be reviewed and investigated and will result in a response +that is deemed necessary and appropriate to the circumstances. Maintainers are +obligated to maintain confidentiality with regard to the reporter of an +incident. + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 1.3.0, available at +[http://contributor-covenant.org/version/1/3/0/][version], +and the [Swift Code of Conduct][swift]. 
+
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/3/0/
+[swift]: https://swift.org/community/#code-of-conduct
+
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000000000..95729f845ff5c
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,24 @@
+Contributing to pandas
+======================
+
+Whether you are a novice or experienced software developer, all contributions and suggestions are welcome!
+
+Our main contribution docs can be found [here](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst), but if you do not want to read them in their entirety, we will summarize the main ways in which you can contribute and point to relevant places in the docs for further information.
+
+Getting Started
+---------------
+If you are looking to contribute to the *pandas* codebase, the best place to start is the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues). This is also a great place for filing bug reports and making suggestions for ways in which we can improve the code and documentation.
+
+If you have additional questions, feel free to ask them on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). Further information can also be found in the [Getting Started](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#where-to-start) section of our main contribution docs.
+
+Filing Issues
+-------------
+If you notice a bug in the code or in the docs, or have suggestions for how we can improve either, feel free to create an issue on the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) using [GitHub's "issue" form](https://github.com/pandas-dev/pandas/issues/new). The form contains some questions that will help us best address your issue. For more information regarding how to file issues against *pandas*, please refer to the [Bug reports and enhancement requests](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#bug-reports-and-enhancement-requests) section of our main contribution docs.
+
+Contributing to the Codebase
+----------------------------
+The code is hosted on [GitHub](https://www.github.com/pandas-dev/pandas), so you will need to use [Git](http://git-scm.com/) to clone the project and make changes to the codebase. Once you have obtained a copy of the code, you should create a development environment that is separate from your existing Python environment so that you can make and test changes without compromising your own work environment. For more information, please refer to the [Working with the code](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#working-with-the-code) section of our main contribution docs.
+
+Before submitting your changes for review, make sure to check that your changes do not break any tests; a minimal smoke test is sketched at the end of this section. More information about our test suites can be found [here](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#test-driven-development-code-writing). We also have guidelines regarding coding style that will be enforced during testing. Details about coding style can be found [here](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#code-standards).
+
+Once your changes are ready to be submitted, make sure to push your changes to GitHub before creating a pull request. Details about how to do that can be found in the [Contributing your changes to pandas](https://github.com/pandas-dev/pandas/blob/master/doc/source/contributing.rst#contributing-your-changes-to-pandas) section of our main contribution docs. We will review your changes, and you will most likely be asked to make additional changes before your pull request is ready to merge. However, once it is ready, we will merge it, and you will have successfully contributed to the codebase!
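To make the test check above concrete, a minimal smoke test of a development build might look like the following sketch (the DataFrame and the assertion are invented for illustration; the linked docs describe the real test suites and runners):

```python
import pandas as pd

# Smoke test: the development build imports and basic operations work.
print(pd.__version__)  # should report your development version

df = pd.DataFrame({"x": [1, 2, 3]})
assert df["x"].sum() == 6  # trivial end-to-end sanity check
```

This is only a quick sanity check; run the relevant test modules before opening a pull request.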
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000000000..e33835c462511
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,29 @@
+#### Code Sample, a copy-pastable example if possible
+
+```python
+# Your code here
+
+```
+#### Problem description
+
+[this should explain **why** the current behaviour is a problem and why the expected output is a better solution.]
+
+**Note**: We receive a lot of issues on our GitHub tracker, so it is very possible that your issue has been posted before. Please check first before submitting so that we do not have to handle and close duplicates!
+
+**Note**: Many problems can be resolved by simply upgrading `pandas` to the latest version. Before submitting, please check if that solution works for you. If possible, you may want to check if `master` addresses this issue, but that is not necessary.
+
+For documentation-related issues, you can check the latest versions of the docs on `master` here:
+
+https://pandas-docs.github.io/pandas-docs-travis/
+
+If the issue has not been resolved there, go ahead and file it in the issue tracker.
+
+#### Expected Output
+
+#### Output of ``pd.show_versions()``
+
+<details>
+
+[paste the output of ``pd.show_versions()`` here below this line]
+
+</details>
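For illustration, a filled-in report based on the template above might look like the following hypothetical example (the data and the observed behaviour are invented):

```python
import pandas as pd

# Short, self-contained snippet that reproduces the (hypothetical) problem
df = pd.DataFrame({"a": [1, 2, None]})
print(df["a"].sum())  # paste the actual output you observe here

# Version information for the "Output of pd.show_versions()" section
pd.show_versions()
```

A report in this shape lets maintainers reproduce the issue without guessing at the input data or the environment.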
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000..4e1e9ce017408 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,4 @@ +- [ ] closes #xxxx +- [ ] tests added / passed +- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` +- [ ] whatsnew entry diff --git a/.gitignore b/.gitignore index d33df2df6e548..96b1f945870de 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *$ *.bak *flymake* +*.iml *.kdev4 *.log *.swp @@ -18,6 +19,9 @@ .vagrant .noseids .ipynb_checkpoints +.tags +.cache/ +.vscode/ # Compiled source # ################### @@ -26,6 +30,7 @@ *.class *.dll *.exe +*.pxi *.o *.py[ocd] *.so @@ -43,6 +48,7 @@ dist # Egg metadata *.egg-info .eggs +.pypirc # tox testing tool .tox @@ -53,6 +59,9 @@ dist **/wheelhouse/* # coverage .coverage +coverage.xml +coverage_html_report +*.pytest_cache # OS generated files # ###################### @@ -66,6 +75,7 @@ Thumbs.db # Data files # ############## *.dta +*.xpt *.h5 pandas/io/*.dat pandas/io/*.json @@ -79,9 +89,12 @@ scikits *.c *.cpp -# Performance Testing # -####################### -asv_bench/ +# Unit / Performance Testing # +############################## +asv_bench/env/ +asv_bench/html/ +asv_bench/results/ +asv_bench/pandas/ # Documentation generated files # ################################# @@ -93,3 +106,7 @@ doc/source/index.rst doc/build/html/index.html # Windows specific leftover: doc/tmp.sv +doc/source/styled.xlsx +doc/source/templates/ +env/ +doc/source/savefig/ diff --git a/.pep8speaks.yml b/.pep8speaks.yml new file mode 100644 index 0000000000000..fda26d87bf7f6 --- /dev/null +++ b/.pep8speaks.yml @@ -0,0 +1,12 @@ +# File : .pep8speaks.yml + +scanner: + diff_only: True # If True, errors caused by only the patch are shown + +pycodestyle: + max-line-length: 79 + ignore: # Errors and warnings to ignore + - E402, # module level import not at top of file + - E731, # do not assign a lambda expression, use a def + - E741, # do not use variables named 'l', 'O', or 'I' + - W503 # line break before binary operator diff --git a/.travis.yml b/.travis.yml index 4e46fb7ad85ca..2d2a0bc019c80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,190 +1,138 @@ - +sudo: false language: python +# Default Python version is usually 2.7 +python: 3.5 -env: +# To turn off cached cython files and compiler cache +# set NOCACHE-true +# To delete caches go to https://travis-ci.org/OWNER/REPOSITORY/caches or run +# travis cache --delete inside the project directory from the travis command line client +# The cache directories will be deleted if anything in ci/ changes in a commit +cache: + ccache: true + directories: + - $HOME/.cache # cython cache + - $HOME/.ccache # compiler cache +env: global: - # scatterci API key - #- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ=" - # ironcache API key - - secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" - - secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" - # pandas-docs-bot GH - - secure: 
"PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" + # create a github personal access token + # cd pandas-dev/pandas + # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas + - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" git: # for cloning - depth: 100 + depth: 1000 matrix: fast_finish: true + exclude: + # Exclude the default Python 3.5 build + - python: 3.5 include: - - python: 2.6 - env: - - JOB_NAME: "26_nslow_nnet" - - NOSE_ARGS="not slow and not network and not disabled" - - CLIPBOARD=xclip - - LOCALE_OVERRIDE="it_IT.UTF-8" - - BUILD_TYPE=conda - - INSTALL_TEST=true - - python: 2.7 - env: - - JOB_NAME: "27_slow_nnet_LOCALE" - - NOSE_ARGS="slow and not network and not disabled" - - LOCALE_OVERRIDE="zh_CN.GB18030" - - FULL_DEPS=true - - JOB_TAG=_LOCALE - - BUILD_TYPE=conda - - python: 2.7 - env: - - JOB_NAME: "27_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD_GUI=gtk2 - - BUILD_TYPE=conda - - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests - - python: 3.4 + - os: osx + language: generic env: - - JOB_NAME: "34_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - BUILD_TYPE=conda - - python: 3.5 + - JOB="3.5, OSX" ENV_FILE="ci/travis-35-osx.yaml" TEST_ARGS="--skip-slow --skip-network" + + - dist: trusty env: - - JOB_NAME: "35_nslow" - - NOSE_ARGS="not slow and not network and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - BUILD_TYPE=conda - - python: 3.3 + - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + + - dist: trusty env: - - JOB_NAME: "33_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - BUILD_TYPE=conda - - python: 2.7 + - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true + addons: + apt: + packages: + - language-pack-zh-hans + - dist: trusty env: - - JOB_NAME: "27_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true - - BUILD_TYPE=conda - - python: 3.4 + - JOB="2.7, lint" ENV_FILE="ci/travis-27.yaml" TEST_ARGS="--skip-slow" LINT=true + addons: + apt: + packages: + - python-gtk2 + - dist: trusty env: - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - BUILD_TYPE=conda - - python: 2.7 + - JOB="3.6, coverage" ENV_FILE="ci/travis-36.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true + # In allow_failures + - dist: trusty env: - - JOB_NAME: "27_build_test_conda" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - BUILD_TYPE=conda - - BUILD_TEST=true - - python: 2.7 + - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true + # In allow_failures + - dist: trusty env: - - JOB_NAME: "27_build_test_pydata" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - BUILD_TYPE=pydata - - BUILD_TEST=true - - python: 2.7 + - JOB="3.6, NumPy dev" ENV_FILE="ci/travis-36-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + addons: + apt: + packages: + - xsel + # In allow_failures + - dist: trusty env: 
- - JOB_NAME: "27_numpy_master" - - JOB_TAG=_NUMPY_DEV_master - - NOSE_ARGS="not slow and not network and not disabled" - - NUMPY_BUILD=master - - BUILD_TYPE=pydata - - PANDAS_TESTING_MODE="deprecate" + - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true allow_failures: - - python: 3.3 - env: - - JOB_NAME: "33_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - BUILD_TYPE=conda - - python: 2.7 + - dist: trusty env: - - JOB_NAME: "27_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true - - BUILD_TYPE=conda - - python: 3.4 + - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true + - dist: trusty env: - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - BUILD_TYPE=conda - - python: 2.7 + - JOB="3.6, NumPy dev" ENV_FILE="ci/travis-36-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + addons: + apt: + packages: + - xsel + - dist: trusty env: - - JOB_NAME: "27_numpy_master" - - JOB_TAG=_NUMPY_DEV_master - - NOSE_ARGS="not slow and not network and not disabled" - - NUMPY_BUILD=master - - BUILD_TYPE=pydata - - PANDAS_TESTING_MODE="deprecate" - - python: 2.7 - env: - - JOB_NAME: "27_build_test_conda" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - BUILD_TYPE=conda - - BUILD_TEST=true - - python: 2.7 - env: - - JOB_NAME: "27_build_test_pydata" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - BUILD_TYPE=pydata - - BUILD_TEST=true + - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true before_install: - echo "before_install" - - echo $VIRTUAL_ENV - - export PATH="$HOME/miniconda/bin:$PATH" - - sudo apt-get install ccache + # set non-blocking IO on travis + # https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024 + - python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);' + - source ci/travis_process_gbq_encryption.sh + - export PATH="$HOME/miniconda3/bin:$PATH" - df -h - - date - pwd - uname -a - - python -V - - ci/before_install.sh - # Xvfb stuff for clipboard functionality; see the travis-ci documentation - - export DISPLAY=:99.0 - - sh -e /etc/init.d/xvfb start + - git --version + - git tag install: - - echo "install" - - ci/prep_ccache.sh - - ci/install_${BUILD_TYPE}.sh - - ci/submit_ccache.sh + - echo "install start" + - ci/prep_cython_cache.sh + - ci/install_travis.sh + - ci/submit_cython_cache.sh + - echo "install done" before_script: - - mysql -e 'create database pandas_nosetest;' - - psql -c 'create database pandas_nosetest;' -U postgres + - ci/install_db_travis.sh + - export DISPLAY=":99.0" + - ci/before_script_travis.sh script: - - echo "script" - - ci/run_build_docs.sh & - - ci/script.sh -# nothing here, or failed tests won't fail travis + - echo "script start" + - ci/run_build_docs.sh + - ci/script_single.sh + - ci/script_multi.sh + - ci/lint.sh + - echo "checking imports" + - source activate pandas && python ci/check_imports.py + - echo "script done" + +after_success: + - ci/upload_coverage.sh after_script: - - ci/install_test.sh - - if [ -f /tmp/doc.log ]; then cat /tmp/doc.log; fi - - source activate pandas && ci/print_versions.py - - ci/print_skipped.py /tmp/nosetests.xml - - ci/after_script.sh + - echo "after_script start" + - source activate pandas && pushd /tmp && python -c 
"import pandas; pandas.show_versions();" && popd + - if [ -e /tmp/single.xml ]; then + ci/print_skipped.py /tmp/single.xml; + fi + - if [ -e /tmp/multiple.xml ]; then + ci/print_skipped.py /tmp/multiple.xml; + fi + - echo "after_script done" diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000000000..dcaaea101f4c8 --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,57 @@ +About the Copyright Holders +=========================== + +* Copyright (c) 2008-2011 AQR Capital Management, LLC + + AQR Capital Management began pandas development in 2008. Development was + led by Wes McKinney. AQR released the source under this license in 2009. +* Copyright (c) 2011-2012, Lambda Foundry, Inc. + + Wes is now an employee of Lambda Foundry, and remains the pandas project + lead. +* Copyright (c) 2011-2012, PyData Development Team + + The PyData Development Team is the collection of developers of the PyData + project. This includes all of the PyData sub-projects, including pandas. The + core team that coordinates development on GitHub can be found here: + http://github.com/pydata. + +Full credits for pandas contributors can be found in the documentation. + +Our Copyright Policy +==================== + +PyData uses a shared copyright model. Each contributor maintains copyright +over their contributions to PyData. However, it is important to note that +these contributions are typically only changes to the repositories. Thus, +the PyData source code, in its entirety, is not the copyright of any single +person or institution. Instead, it is the collective copyright of the +entire PyData Development Team. If individual contributors want to maintain +a record of what changes/contributions they have specific copyright on, +they should indicate their copyright in the commit message of the change +when they commit the change to one of the PyData repositories. + +With this in mind, the following banner should be used in any source code +file to indicate the copyright and license terms: + +``` +#----------------------------------------------------------------------------- +# Copyright (c) 2012, PyData Development Team +# All rights reserved. +# +# Distributed under the terms of the BSD Simplified License. +# +# The full license is in the LICENSE file, distributed with this software. +#----------------------------------------------------------------------------- +``` + +Other licenses can be found in the LICENSES directory. + +License +======= + +pandas is distributed under a 3-clause ("Simplified" or "New") BSD +license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have +BSD-compatible licenses, are included. Their licenses follow the pandas +license. + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 284ac2fc5b169..0000000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,640 +0,0 @@ -Contributing to pandas -====================== - -Where to start? ---------------- - -All contributions, bug reports, bug fixes, documentation improvements, -enhancements and ideas are welcome. - -If you are simply looking to start working with the *pandas* codebase, -navigate to the [GitHub "issues" -tab](https://github.com/pydata/pandas/issues) and start looking through -interesting issues. There are a number of issues listed under -[Docs](https://github.com/pydata/pandas/issues?labels=Docs&sort=updated&state=open) -and [Difficulty -Novice](https://github.com/pydata/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) -where you could start out. 
-
-Or maybe through using *pandas* you have an idea of your own or are
-looking for something in the documentation and thinking 'this can be
-improved'... you can do something about it!
-
-Feel free to ask questions on the [mailing
-list](https://groups.google.com/forum/?fromgroups#!forum/pydata)
-
-Bug Reports/Enhancement Requests
---------------------------------
-
-Bug reports are an important part of making *pandas* more stable. Having
-a complete bug report will allow others to reproduce the bug and provide
-insight into fixing it. Since many versions of *pandas* are supported,
-knowing version information will also identify improvements made since
-previous versions. Often trying the bug-producing code out on the
-*master* branch is a worthwhile exercise to confirm the bug still
-exists. It is also worth searching existing bug reports and pull
-requests to see if the issue has already been reported and/or fixed.
-
-Bug reports must:
-
-1. Include a short, self-contained Python snippet reproducing the
-   problem. You can have the code formatted nicely by using [GitHub
-   Flavored
-   Markdown](http://github.github.com/github-flavored-markdown/):
-
-   ```python
-   >>> from pandas import DataFrame
-   >>> df = DataFrame(...)
-   ...
-   ```
-
-2. Include the full version string of *pandas* and its dependencies. In
-   recent (>0.12) versions of *pandas* you can use a built-in
-   function:
-
-       >>> from pandas.util.print_versions import show_versions
-       >>> show_versions()
-
-   and from 0.13.1 onwards:
-
-       >>> pd.show_versions()
-
-3. Explain why the current behavior is wrong/not desired and what you
-   expect instead.
-
-The issue will then show up to the *pandas* community and be open to
-comments/ideas from others.
-
-Working with the code
----------------------
-
-Now that you have an issue you want to fix, an enhancement to add, or
-documentation to improve, you need to learn how to work with GitHub and
-the *pandas* code base.
-
-### Version Control, Git, and GitHub
-
-To the new user, working with Git is one of the more daunting aspects of
-contributing to *pandas*. It can very quickly become overwhelming, but
-sticking to the guidelines below will make the process straightforward
-and will work without much trouble. As always, if you are having
-difficulties please feel free to ask for help.
-
-The code is hosted on [GitHub](https://www.github.com/pydata/pandas). To
-contribute you will need to sign up for a [free GitHub
-account](https://github.com/signup/free). We use
-[Git](http://git-scm.com/) for version control to allow many people to
-work together on the project.
-
-Some great resources for learning Git:
-
-- the [GitHub help pages](http://help.github.com/).
-- [NumPy's
-  documentation](http://docs.scipy.org/doc/numpy/dev/index.html).
-- Matthew Brett's
-  [Pydagogue](http://matthew-brett.github.com/pydagogue/).
-
-### Getting Started with Git
-
-[GitHub has instructions](http://help.github.com/set-up-git-redirect)
-for installing git, setting up your SSH key, and configuring git. All
-these steps need to be completed before working seamlessly with your
-local repository and GitHub.
-
-### Forking
-
-You will need your own fork to work on the code. Go to the [pandas
-project page](https://github.com/pydata/pandas) and hit the *fork*
-button.
You will want to clone your fork to your machine:
-
-    git clone git@github.com:your-user-name/pandas.git pandas-yourname
-    cd pandas-yourname
-    git remote add upstream git://github.com/pydata/pandas.git
-
-This creates the directory pandas-yourname and connects your repository
-to the upstream (main project) *pandas* repository.
-
-You will also need to hook up Travis-CI to your GitHub repository so the
-suite is automatically run when a Pull Request is submitted.
-Instructions are
-[here](http://about.travis-ci.org/docs/user/getting-started/).
-
-### Creating a Branch
-
-You want your master branch to reflect only production-ready code, so
-create a feature branch for making your changes. For example:
-
-    git branch shiny-new-feature
-    git checkout shiny-new-feature
-
-The above can be simplified to:
-
-    git checkout -b shiny-new-feature
-
-This changes your working directory to the shiny-new-feature branch.
-Keep any changes in this branch specific to one bug or feature so it is
-clear what the branch brings to *pandas*. You can have many
-shiny-new-features and switch between them using the git checkout
-command.
-
-### Creating a Development Environment
-
-An easy way to create a *pandas* development environment is as follows.
-
-- Install either Anaconda or miniconda
-- Make sure that you have cloned the repository
-- `cd` to the pandas source directory
-
-Tell `conda` to create a new environment, named `pandas_dev`, or any
-name you would like for this environment, by running:
-
-    conda create -n pandas_dev --file ci/requirements_dev.txt
-
-For a Python 3 environment:
-
-    conda create -n pandas_dev python=3 --file ci/requirements_dev.txt
-
-If you are on `windows`, then you will need to install the compiler
-linkages:
-
-    conda install -n pandas_dev libpython
-
-This will create the new environment, and not touch any of your existing
-environments, nor any existing Python installation. It will install all
-of the basic dependencies of *pandas*, as well as the development and
-testing tools. If you would like to install other dependencies, you can
-install them as follows:
-
-    conda install -n pandas_dev -c pandas pytables scipy
-
-To install *all* pandas dependencies you can do the following:
-
-    conda install -n pandas_dev -c pandas --file ci/requirements_all.txt
-
-To work in this environment, `activate` it as follows:
-
-    activate pandas_dev
-
-At which point, the prompt will change to indicate you are in the new
-development environment.
-
-> **note**
->
-> The above syntax is for `windows` environments. To work on
-> `macosx/linux`, use:
->
->     source activate pandas_dev
-
-To view your environments:
-
-    conda info -e
-
-To return to your home (root) environment:
-
-    deactivate
-
-See the full `conda` docs [here](http://conda.pydata.org/docs).
-
-At this point you can easily do an *in-place* install, as detailed in
-the next section.
-
-### Making changes
-
-Before making your code changes, it is often necessary to build the code
-that was just checked out. There are two primary methods of doing this.
-
-1. The best way to develop *pandas* is to build the C extensions
-   in-place by running:
-
-       python setup.py build_ext --inplace
-
-   If you start up the Python interpreter in the *pandas* source
-   directory, you will call the built C extensions.
-
-2. Another very common option is to do a `develop` install of *pandas*:
-
-       python setup.py develop
-
-   This makes a symbolic link that tells the Python interpreter to
-   import *pandas* from your development directory. Thus, you can
-   always be using the development version on your system without being
-   inside the clone directory.
-
-Contributing to the documentation
----------------------------------
-
-If you're not the developer type, contributing to the documentation is
-still of huge value. You don't even have to be an expert on *pandas* to
-do so! Something as simple as rewriting small passages for clarity as
-you reference the docs is a simple but effective way to contribute. The
-next person to read that passage will be in your debt!
-
-Actually, there are sections of the docs that are worse off by being
-written by experts. If something in the docs doesn't make sense to you,
-updating the relevant section after you figure it out is a simple way to
-ensure it will help the next person.
-
-### About the pandas documentation
-
-The documentation is written in **reStructuredText**, which is almost
-like writing in plain English, and built using
-[Sphinx](http://sphinx.pocoo.org/). The Sphinx Documentation has an
-excellent [introduction to reST](http://sphinx.pocoo.org/rest.html).
-Review the Sphinx docs to perform more complex changes to the
-documentation as well.
-
-Some other important things to know about the docs:
-
-- The *pandas* documentation consists of two parts: the docstrings in
-  the code itself and the docs in this folder `pandas/doc/`.
-
-  The docstrings provide a clear explanation of the usage of the
-  individual functions, while the documentation in this folder
-  consists of tutorial-like overviews per topic together with some
-  other information (what's new, installation, etc).
-
-- The docstrings follow the **Numpy Docstring Standard**, which is used
-  widely in the Scientific Python community. This standard specifies
-  the format of the different sections of the docstring. See [this
-  document](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt)
-  for a detailed explanation, or look at some of the existing
-  functions to extend it in a similar manner.
-- The tutorials make heavy use of the [ipython
-  directive](http://matplotlib.org/sampledoc/ipython_directive.html)
-  sphinx extension. This directive lets you put code in the
-  documentation which will be run during the doc build. For example:
-
-      .. ipython:: python
-
-          x = 2
-          x**3
-
-  will be rendered as
-
-      In [1]: x = 2
-
-      In [2]: x**3
-      Out[2]: 8
-
-  This means that almost all code examples in the docs are always run
-  (and the output saved) during the doc build. This way, they will
-  always be up to date, but it makes the doc building a bit more
-  complex.
-
-### How to build the pandas documentation
-
-#### Requirements
-
-To build the *pandas* docs there are some extra requirements: you will
-need to have `sphinx` and `ipython` installed.
-[numpydoc](https://github.com/numpy/numpydoc) is used to parse the
-docstrings that follow the Numpy Docstring Standard (see above), but you
-don't need to install this because a local copy of `numpydoc` is
-included in the *pandas* source code.
-
-It is easiest to create a development environment (see above), then
-install:
-
-    conda install -n pandas_dev sphinx ipython
-
-Furthermore, it is recommended to have all [optional
-dependencies](http://pandas.pydata.org/pandas-docs/dev/install.html#optional-dependencies)
-installed.
This is not strictly necessary, but be aware that you will
-see some error messages. Because all the code in the documentation is
-executed during the doc build, the examples using these optional
-dependencies will generate errors. Run `pd.show_versions()` to get an
-overview of the installed version of all dependencies.
-
-> **warning**
->
-> Sphinx version >= 1.2.2 or the older 1.1.3 is required.
-
-#### Building the documentation
-
-So how do you build the docs? Navigate to your local `pandas/doc/`
-directory in the console and run:
-
-    python make.py html
-
-You can then find the HTML output in the folder
-`pandas/doc/build/html/`.
-
-The first time it will take quite a while, because it has to run all the
-code examples in the documentation and build all generated docstring
-pages. In subsequent invocations, sphinx will try to only build the pages
-that have been modified.
-
-If you want to do a full clean build, do:
-
-    python make.py clean
-    python make.py build
-
-Starting with 0.13.1 you can tell `make.py` to compile only a single
-section of the docs, greatly reducing the turn-around time for checking
-your changes. You will be prompted to delete .rst files that aren't
-required, since the last committed version can always be restored from
-git.
-
-    # omit autosummary and API section
-    python make.py clean
-    python make.py --no-api
-
-    # compile the docs with only a single
-    # section, that which is in indexing.rst
-    python make.py clean
-    python make.py --single indexing
-
-For comparison, a full documentation build may take 10 minutes, a
-`--no-api` build may take 3 minutes, and a single section may take 15
-seconds. However, subsequent builds only process portions you changed.
-Now, open the following file in a web browser to see the full
-documentation you just built:
-
-    pandas/doc/build/html/index.html
-
-And you'll have the satisfaction of seeing your new and improved
-documentation!
-
-Contributing to the code base
------------------------------
-
-### Code Standards
-
-*pandas* uses the [PEP8](http://www.python.org/dev/peps/pep-0008/)
-standard. There are several tools to ensure you abide by this standard.
-
-We've written a tool to check that your commits are PEP8 great, [pip
-install pep8radius](https://github.com/hayd/pep8radius). Look at PEP8
-fixes in your branch vs master with:
-
-    pep8radius master --diff
-
-and make these changes with:
-
-    pep8radius master --diff --in-place
-
-Alternatively, use the [flake8](http://pypi.python.org/pypi/flake8) tool
-for checking the style of your code. Additional standards are outlined on
-the [code style wiki
-page](https://github.com/pydata/pandas/wiki/Code-Style-and-Conventions).
-
-Please try to maintain backward-compatibility. *Pandas* has lots of
-users with lots of existing code, so don't break it if at all possible.
-If you think breakage is required, clearly state why as part of the Pull
-Request. Also, be careful when changing method signatures and add
-deprecation warnings where needed.
-
-### Test-driven Development/Writing Code
-
-*Pandas* is serious about [Test-driven Development
-(TDD)](http://en.wikipedia.org/wiki/Test-driven_development). This
-development process "relies on the repetition of a very short
-development cycle: first the developer writes an (initially failing)
-automated test case that defines a desired improvement or new function,
-then produces the minimum amount of code to pass that test." So, before
-actually writing any code, you should write your tests.
Often the test
-can be taken from the original GitHub issue. However, it is always worth
-considering additional use cases and writing corresponding tests.
-
-Adding tests is one of the most common requests after code is pushed to
-*pandas*. It is worth getting in the habit of writing tests ahead of
-time so this is never an issue.
-
-Like many packages, *pandas* uses the [Nose testing
-system](http://somethingaboutorange.com/mrl/projects/nose/) and the
-convenient extensions in
-[numpy.testing](http://docs.scipy.org/doc/numpy/reference/routines.testing.html).
-
-#### Writing tests
-
-All tests should go into the *tests* subdirectory of the specific
-package. There are probably many examples already there and looking at
-these for inspiration is suggested. If your test requires working with
-files or network connectivity, there is more information on the [testing
-page](https://github.com/pydata/pandas/wiki/Testing) of the wiki.
-
-The `pandas.util.testing` module has many special `assert` functions
-that make it easier to make statements about whether Series or DataFrame
-objects are equivalent. The easiest way to verify that your code is
-correct is to explicitly construct the result you expect, then compare
-the actual result to the expected correct result:
-
-    def test_pivot(self):
-        data = {
-            'index' : ['A', 'B', 'C', 'C', 'B', 'A'],
-            'columns' : ['One', 'One', 'One', 'Two', 'Two', 'Two'],
-            'values' : [1., 2., 3., 3., 2., 1.]
-        }
-
-        frame = DataFrame(data)
-        pivoted = frame.pivot(index='index', columns='columns', values='values')
-
-        expected = DataFrame({
-            'One' : {'A' : 1., 'B' : 2., 'C' : 3.},
-            'Two' : {'A' : 1., 'B' : 2., 'C' : 3.}
-        })
-
-        assert_frame_equal(pivoted, expected)
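In the same spirit as `test_pivot` above, a smaller hypothetical sketch using the Series helper from `pandas.util.testing` (the tested behaviour and names are invented for illustration):

```python
import pandas as pd
from pandas.util.testing import assert_series_equal

def test_cumsum_simple():
    # Construct the expected result explicitly, then compare.
    result = pd.Series([1, 2, 3]).cumsum()
    expected = pd.Series([1, 3, 6])
    assert_series_equal(result, expected)
```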
-
-#### Running the test suite
-
-The tests can then be run directly inside your git clone (without having
-to install *pandas*) by typing:
-
-    nosetests pandas
-
-The test suite is exhaustive and takes around 20 minutes to run. Often
-it is worth running only a subset of tests around your changes first,
-before running the entire suite. This is done using one of the following
-constructs:
-
-    nosetests pandas/tests/[test-module].py
-    nosetests pandas/tests/[test-module].py:[TestClass]
-    nosetests pandas/tests/[test-module].py:[TestClass].[test_method]
-
-#### Running the performance test suite
-
-Performance matters, and it is worth checking that your code has not
-introduced performance regressions. Currently *pandas* uses the [vbench
-library](https://github.com/pydata/vbench) to enable easy monitoring of
-the performance of critical *pandas* operations. These benchmarks are
-all found in the `pandas/vb_suite` directory. vbench currently only
-works on Python 2.
-
-To install vbench:
-
-    pip install git+https://github.com/pydata/vbench
-
-Vbench also requires sqlalchemy, gitpython, and psutil, which can all be
-installed using pip. If you need to run a benchmark, change your
-directory to the *pandas* root and run:
-
-    ./test_perf.sh -b master -t HEAD
-
-This will check out the master revision and run the suite on both master
-and your commit. Running the full benchmark suite can take up to one hour
-and use up to 3GB of RAM. Usually it is sufficient to paste a subset of
-the results into the Pull Request to show that the committed changes do
-not cause unexpected performance regressions.
-
-You can run specific benchmarks using the *-r* flag, which takes a
-regular expression.
-
-See the [performance testing
-wiki](https://github.com/pydata/pandas/wiki/Performance-Testing) for
-information on how to write a benchmark.
-
-### Documenting your code
-
-Changes should be reflected in the release notes located in
-doc/source/whatsnew/vx.y.z.txt. This file contains an ongoing change log
-for each release. Add an entry to this file to document your fix,
-enhancement or (unavoidable) breaking change. Make sure to include the
-GitHub issue number when adding your entry.
-
-If your code is an enhancement, it is most likely necessary to add usage
-examples to the existing documentation. This can be done following the
-section regarding documentation.
-
-Contributing your changes to *pandas*
--------------------------------------
-
-### Committing your code
-
-Keep style fixes to a separate commit to make your PR more readable.
-
-Once you've made changes, you can see them by typing:
-
-    git status
-
-If you've created a new file, it is not being tracked by git. Add it by
-typing:
-
-    git add path/to/file-to-be-added.py
-
-Doing 'git status' again should give something like:
-
-    # On branch shiny-new-feature
-    #
-    #   modified:   /relative/path/to/file-you-added.py
-    #
-
-Finally, commit your changes to your local repository with an
-explanatory message. An informal commit message format is in effect for
-the project. Please try to adhere to it. Here are some common prefixes
-along with general guidelines for when to use them:
-
-> - ENH: Enhancement, new functionality
-> - BUG: Bug fix
-> - DOC: Additions/updates to documentation
-> - TST: Additions/updates to tests
-> - BLD: Updates to the build process/scripts
-> - PERF: Performance improvement
-> - CLN: Code cleanup
-
-The following defines how a commit message should be structured. Please
-reference the relevant GitHub issues in your commit message using GH1234
-or #1234. Either style is fine, but the former is generally preferred:
-
-> - a subject line with < 80 chars.
-> - One blank line.
-> - Optionally, a commit message body.
-
-Now you can commit your changes in your local repository:
-
-    git commit -m
-
-If you have multiple commits, it is common to want to combine them into
-one commit, often referred to as "squashing" or "rebasing". This is a
-common request by package maintainers when submitting a Pull Request, as
-it maintains a more compact commit history. To rebase your commits:
-
-    git rebase -i HEAD~#
-
-Where # is the number of commits you want to combine. Then you can pick
-the relevant commit message and discard others.
-
-### Pushing your changes
-
-When you want your changes to appear publicly on your GitHub page, push
-your forked feature branch's commits:
-
-    git push origin shiny-new-feature
-
-Here origin is the default name given to your remote repository on
-GitHub. You can see the remote repositories:
-
-    git remote -v
-
-If you added the upstream repository as described above, you will see
-something like:
-
-    origin    git@github.com:yourname/pandas.git (fetch)
-    origin    git@github.com:yourname/pandas.git (push)
-    upstream  git://github.com/pydata/pandas.git (fetch)
-    upstream  git://github.com/pydata/pandas.git (push)
-
-Now your code is on GitHub, but it is not yet a part of the *pandas*
-project. For that to happen, a Pull Request needs to be submitted on
-GitHub.
-
-### Review your code
-
-When you're ready to ask for a code review, you will file a Pull
-Request.
Before you do, again make sure you've followed all the
-guidelines outlined in this document regarding code style, tests,
-performance tests, and documentation. You should also double check your
-branch changes against the branch it was based off of:
-
-1. Navigate to your repository on GitHub.
-2. Click on Branches.
-3. Click on the Compare button for your feature branch.
-4. Select the base and compare branches, if necessary. This will be
-   master and shiny-new-feature, respectively.
-
-### Finally, make the Pull Request
-
-If everything looks good, you are ready to make a Pull Request. A Pull
-Request is how code from a local repository becomes available to the
-GitHub community and can be looked at and eventually merged into the
-master version. This Pull Request and its associated changes will
-eventually be committed to the master branch and available in the next
-release. To submit a Pull Request:
-
-1. Navigate to your repository on GitHub.
-2. Click on the Pull Request button.
-3. You can then click on Commits and Files Changed to make sure
-   everything looks okay one last time.
-4. Write a description of your changes in the Preview Discussion tab.
-5. Click Send Pull Request.
-
-This request then appears to the repository maintainers, and they will
-review the code. If you need to make more changes, you can make them in
-your branch, push them to GitHub, and the pull request will be
-automatically updated. Pushing them to GitHub again is done by:
-
-    git push -f origin shiny-new-feature
-
-This will automatically update your Pull Request with the latest code
-and restart the Travis-CI tests.
-
-### Delete your merged branch (optional)
-
-Once your feature branch is accepted into upstream, you'll probably want
-to get rid of the branch. First, merge upstream master into your branch
-so git knows it is safe to delete your branch:
-
-    git fetch upstream
-    git checkout master
-    git merge upstream/master
-
-Then you can just do:
-
-    git branch -d shiny-new-feature
-
-Make sure you use a lower-case -d, or else git won't warn you if your
-feature branch has not actually been merged.
-
-The branch will still exist on GitHub, so to delete it there do:
-
-    git push origin --delete shiny-new-feature
diff --git a/LICENSE b/LICENSE
index c9b8834e8774b..924de26253bf4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,87 +1,29 @@
-=======
-License
-=======
+BSD 3-Clause License
-pandas is distributed under a 3-clause ("Simplified" or "New") BSD
-license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have
-BSD-compatible licenses, are included. Their licenses follow the pandas
-license.
-
-pandas license
-==============
-
-Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
-All rights reserved.
-
-Copyright (c) 2008-2011 AQR Capital Management, LLC
+Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
- - * Neither the name of the copyright holder nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -About the Copyright Holders -=========================== - -AQR Capital Management began pandas development in 2008. Development was -led by Wes McKinney. AQR released the source under this license in 2009. -Wes is now an employee of Lambda Foundry, and remains the pandas project -lead. - -The PyData Development Team is the collection of developers of the PyData -project. This includes all of the PyData sub-projects, including pandas. The -core team that coordinates development on GitHub can be found here: -http://github.com/pydata. - -Full credits for pandas contributors can be found in the documentation. - -Our Copyright Policy -==================== - -PyData uses a shared copyright model. Each contributor maintains copyright -over their contributions to PyData. However, it is important to note that -these contributions are typically only changes to the repositories. Thus, -the PyData source code, in its entirety, is not the copyright of any single -person or institution. Instead, it is the collective copyright of the -entire PyData Development Team. 
If individual contributors want to maintain -a record of what changes/contributions they have specific copyright on, -they should indicate their copyright in the commit message of the change -when they commit the change to one of the PyData repositories. - -With this in mind, the following banner should be used in any source code -file to indicate the copyright and license terms: - -#----------------------------------------------------------------------------- -# Copyright (c) 2012, PyData Development Team -# All rights reserved. -# -# Distributed under the terms of the BSD Simplified License. -# -# The full license is in the LICENSE file, distributed with this software. -#----------------------------------------------------------------------------- - -Other licenses can be found in the LICENSES directory. \ No newline at end of file diff --git a/LICENSES/SAS7BDAT_LICENSE b/LICENSES/SAS7BDAT_LICENSE new file mode 100644 index 0000000000000..8fbf194013e93 --- /dev/null +++ b/LICENSES/SAS7BDAT_LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2015 Jared Hobbs + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index defca46e7f820..3b2886eb9cfae 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -25,10 +25,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. - * Copyright (c) 1994 Sun Microsystems, Inc. \ No newline at end of file + * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE new file mode 100644 index 0000000000000..37ec93a14fdcd --- /dev/null +++ b/LICENSES/XARRAY_LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. 
+ +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. 
+ +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. 
+ +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
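As a concrete sketch of the appendix's instructions (assuming a Python source file; the year and owner below are hypothetical placeholders, not values from this repository, and pandas itself does not use this header):

```python
# NOTE: "2018" and "Jane Doe" are hypothetical placeholders for the
# bracketed [yyyy] and [name of copyright owner] fields in the appendix.
#
# Copyright 2018 Jane Doe
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example module: the license is applied entirely by the comment header."""
```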
diff --git a/MANIFEST.in b/MANIFEST.in index 2d26fbfd6adaf..b417b8890fa24 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,28 +1,41 @@ include MANIFEST.in include LICENSE include RELEASE.md -include README.rst +include README.md include setup.py graft doc prune doc/build -graft examples +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py +include pandas/io/formats/templates/*.tpl diff --git a/Makefile b/Makefile index 9a768932b8bea..4a82566cf726e 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -tseries: pandas/lib.pyx pandas/tslib.pyx pandas/hashtable.pyx +tseries: pandas/_libs/lib.pyx pandas/_libs/tslib.pyx pandas/_libs/hashtable.pyx python setup.py build_ext --inplace .PHONY : develop build clean clean_pyc tseries doc @@ -9,12 +9,12 @@ clean: clean_pyc: -find . -name '*.py[co]' -exec rm {} \; -sparse: pandas/src/sparse.pyx - python setup.py build_ext --inplace - build: clean_pyc python setup.py build_ext --inplace +lint-diff: + git diff master --name-only -- "*.py" | grep "pandas" | xargs flake8 + develop: build -python setup.py develop @@ -23,3 +23,4 @@ doc: cd doc; \ python make.py clean; \ python make.py html + python make.py spellcheck diff --git a/README.md b/README.md index 66e7605a63142..3c8fe57400099 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,93 @@ +
+
+-----------------
+
+# pandas: powerful Python data analysis toolkit
+
[badge residue: the rewritten README header opens with a centered logo block and an HTML table of status badges whose markup did not survive extraction. The recoverable row labels are Latest Release (PyPI and conda), Package Status, License, Build Status (travis, circleci, and appveyor), Coverage, Downloads (conda-forge, replacing the removed conda and pypi download badges), and Gitter; the old Markdown badge lines are removed by this hunk.]
-[![https://gitter.im/pydata/pandas](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + ## What is it @@ -89,125 +134,49 @@ Here are just a few of the things that pandas does well: moving window linear regressions, date shifting and lagging, etc. - [missing-data]: http://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data - [insertion-deletion]: http://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion - [alignment]: http://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures - [groupby]: http://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine - [conversion]: http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe - [slicing]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges - [fancy-indexing]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix - [subsetting]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing - [merging]: http://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging - [joining]: http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index - [reshape]: http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables - [pivot-table]: http://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations - [mi]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex - [flat-files]: http://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files - [excel]: http://pandas.pydata.org/pandas-docs/stable/io.html#excel-files - [db]: http://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries - [hdfstore]: http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables - [timeseries]: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality + [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data + [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion + [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures + [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine + [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe + [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges + [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix + [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing + [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging + [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index + [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables + [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations + [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex + [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files + [excel]: 
https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files + [db]: https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries + [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables + [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality ## Where to get it The source code is currently hosted on GitHub at: -http://github.com/pydata/pandas +https://github.com/pandas-dev/pandas -Binary installers for the latest released version are available at the Python -package index - - http://pypi.python.org/pypi/pandas/ - -And via `easy_install`: +Binary installers for the latest released version are available at the [Python +package index](https://pypi.org/project/pandas) and on conda. ```sh -easy_install pandas +# conda +conda install pandas ``` -or `pip`: - ```sh +# or PyPI pip install pandas ``` -or `conda`: - -```sh -conda install pandas -``` - ## Dependencies -- [NumPy](http://www.numpy.org): 1.7.0 or higher -- [python-dateutil](http://labix.org/python-dateutil): 1.5 or higher -- [pytz](http://pytz.sourceforge.net) - - Needed for time zone support with ``pandas.date_range`` - -### Highly Recommended Dependencies -- [numexpr](https://github.com/pydata/numexpr) - - Needed to accelerate some expression evaluation operations - - Required by PyTables -- [bottleneck](http://berkeleyanalytics.com/bottleneck) - - Needed to accelerate certain numerical operations - -### Optional dependencies -- [Cython](http://www.cython.org): Only necessary to build development version. Version 0.17.1 or higher. -- [SciPy](http://www.scipy.org): miscellaneous statistical functions -- [PyTables](http://www.pytables.org): necessary for HDF5-based storage -- [SQLAlchemy](http://www.sqlalchemy.org): for SQL database support. Version 0.8.1 or higher recommended. -- [matplotlib](http://matplotlib.sourceforge.net/): for plotting -- [statsmodels](http://statsmodels.sourceforge.net/) - - Needed for parts of `pandas.stats` -- For Excel I/O: - - [xlrd/xlwt](http://www.python-excel.org/) - - Excel reading (xlrd) and writing (xlwt) - - [openpyxl](http://packages.python.org/openpyxl/) - - openpyxl version 1.6.1 or higher, but lower than 2.0.0, for - writing .xlsx files - - xlrd >= 0.9.0 - - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) - - Alternative Excel writer. -- [Google bq Command Line Tool](https://cloud.google.com/bigquery/bq-command-line-tool) - - Needed for `pandas.io.gbq` -- [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. -- One of the following combinations of libraries is needed to use the - top-level [`pandas.read_html`][read-html-docs] function: - - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] (Any - recent version of [html5lib][html5lib] is okay.) - - [BeautifulSoup4][BeautifulSoup4] and [lxml][lxml] - - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] and [lxml][lxml] - - Only [lxml][lxml], although see [HTML reading gotchas][html-gotchas] - for reasons as to why you should probably **not** take this approach. - -#### Notes about HTML parsing libraries -- If you install [BeautifulSoup4][BeautifulSoup4] you must install - either [lxml][lxml] or [html5lib][html5lib] or both. - `pandas.read_html` will **not** work with *only* `BeautifulSoup4` - installed. -- You are strongly encouraged to read [HTML reading - gotchas][html-gotchas]. It explains issues surrounding the - installation and usage of the above three libraries. 
-- You may need to install an older version of - [BeautifulSoup4][BeautifulSoup4]: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian -- Additionally, if you're using [Anaconda][Anaconda] you should - definitely read [the gotchas about HTML parsing][html-gotchas] - libraries -- If you're on a system with `apt-get` you can do - - ```sh - sudo apt-get build-dep python-lxml - ``` - - to get the necessary dependencies for installation of [lxml][lxml]. - This will prevent further headaches down the line. - - [html5lib]: https://github.com/html5lib/html5lib-python "html5lib" - [BeautifulSoup4]: http://www.crummy.com/software/BeautifulSoup "BeautifulSoup4" - [lxml]: http://lxml.de - [Anaconda]: https://store.continuum.io/cshop/anaconda - [NumPy]: http://numpy.scipy.org/ - [html-gotchas]: http://pandas.pydata.org/pandas-docs/stable/gotchas.html#html-table-parsing - [read-html-docs]: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.html.read_html.html#pandas.io.html.read_html +- [NumPy](https://www.numpy.org): 1.9.0 or higher +- [python-dateutil](https://labix.org/python-dateutil): 2.5.0 or higher +- [pytz](https://pythonhosted.org/pytz): 2011k or higher + +See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) +for recommended and optional dependencies. ## Installation from sources To install pandas from source you need Cython in addition to the normal @@ -238,32 +207,36 @@ mode](https://pip.pypa.io/en/latest/reference/pip_install.html#editable-installs pip install -e . ``` -On Windows, you will need to install MinGW and execute: - -```sh -python setup.py build --compiler=mingw32 -python setup.py install -``` - -See http://pandas.pydata.org/ for more information. +See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/install.html#installing-from-source). ## License -BSD +[BSD 3](LICENSE) ## Documentation -The official documentation is hosted on PyData.org: http://pandas.pydata.org/ - -The Sphinx documentation should provide a good starting point for learning how -to use the library. Expect the docs to continue to expand as time goes on. +The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable ## Background Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and has been under active development since then. +## Getting Help + +For usage questions, the best place to go to is [StackOverflow](https://stackoverflow.com/questions/tagged/pandas). +Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata). + ## Discussion and Development -Since pandas development is related to a number of other scientific -Python projects, questions are welcome on the scipy-user mailing -list. Specialized discussions or design issues should take place on -the PyData mailing list / Google group: +Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. 
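A quick way to verify the dependency floors listed above after installing (a minimal sketch, not part of this diff, assuming a pandas release recent enough to ship `pandas.show_versions`):

```python
# Minimal post-install sanity check.
import pandas as pd

print(pd.__version__)  # the installed pandas release
pd.show_versions()     # reports NumPy, python-dateutil, pytz, and optional deps
```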
+
+## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)
+
+All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
+
+A detailed overview of how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/pandas-docs/stable/contributing.html)**.
+
+If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
+
+You can also triage issues, which may include reproducing bug reports or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).
+
+Or maybe, through using pandas, you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’; you can do something about it!
-https://groups.google.com/forum/#!forum/pydata
+Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
diff --git a/RELEASE.md b/RELEASE.md
index 23c1817b7647c..efd075dabcba9 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,6 +1,6 @@
 Release Notes
 =============
-The list of changes to pandas between each release can be found
+The list of changes to Pandas between each release can be found
 [here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full
-details, see the commit logs at http://github.com/pydata/pandas.
+details, see the commit logs at http://github.com/pandas-dev/pandas.
diff --git a/appveyor.yml b/appveyor.yml
index 9cec7895f1493..f70fc829ec971 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,38 +1,89 @@
+# With info from
+# http://tjelvarolsson.com/blog/how-to-continuously-test-your-python-code-on-windows-using-appveyor/
+# https://packaging.python.org/en/latest/appveyor/
+# https://github.com/rmcgibbo/python-appveyor-conda-example
+
+# Backslashes in quotes need to be escaped: \ -> "\\"
+
+matrix:
+  fast_finish: true    # immediately finish build once one of the jobs fails.
+ environment: global: # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the - # /E:ON and /V:ON options are not enabled in the batch script intepreter + # /E:ON and /V:ON options are not enabled in the batch script interpreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd" + clone_folder: C:\projects\pandas + PANDAS_TESTING_MODE: "deprecate" matrix: - - PYTHON: "C:\\Python27_32" - PYTHON_VERSION: "2.7" - PYTHON_ARCH: "32" - - PYTHON: "C:\\Python27_64" + - CONDA_ROOT: "C:\\Miniconda3_64" + PYTHON_VERSION: "3.6" + PYTHON_ARCH: "64" + CONDA_PY: "36" + CONDA_NPY: "113" + + - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" + CONDA_PY: "27" + CONDA_NPY: "110" - - PYTHON: "C:\\Python34_32" - PYTHON_VERSION: "3.4" - PYTHON_ARCH: "32" +# We always use a 64-bit machine, but can build x86 distributions +# with the PYTHON_ARCH variable (which is used by CMD_IN_ENV). +platform: + - x64 - - PYTHON: "C:\\Python34_64" - PYTHON_VERSION: "3.4" - PYTHON_ARCH: "64" +# all our python builds have to happen in tests_script... +build: false install: - # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit), - # as well as pip, conda-build, and the binstar CLI + # cancel older builds for the same PR + - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` + https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` + Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` + throw "There are newer queued builds for this pull request, failing early." } + + # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit) + # updates conda & installs: conda-build jinja2 anaconda-client + - powershell .\ci\install.ps1 + - SET PATH=%CONDA_ROOT%;%CONDA_ROOT%\Scripts;%PATH% - echo "install" - cd - ls -ltr - - powershell .\\ci\\install_appveyor.ps1 - - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - git tag --sort v:refname -build: false + # this can conflict with git + - cmd: rmdir C:\cygwin /s /q + + # install our build environment + - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false + - cmd: conda update -q conda + - cmd: conda config --set ssl_verify false + + # add the pandas channel *before* defaults to have defaults take priority + - cmd: conda config --add channels conda-forge + - cmd: conda config --add channels pandas + - cmd: conda config --remove channels defaults + - cmd: conda config --add channels defaults + + # this is now the downloaded conda... 
+ - cmd: conda info -a + + # create our env + - cmd: conda env create -q -n pandas --file=ci\appveyor-%CONDA_PY%.yaml + - cmd: activate pandas + - cmd: conda list -n pandas + # uninstall pandas if it's present + - cmd: conda remove pandas -y --force & exit 0 + - cmd: pip uninstall -y pandas & exit 0 + + # build em using the local source checkout in the correct windows env + - cmd: '%CMD_IN_ENV% python setup.py build_ext --inplace' test_script: - - "%CMD_IN_ENV% %PYTHON%/python.exe setup.py build_ext --inplace" - - "%PYTHON%/Scripts/nosetests -A \"not slow and not network and not disabled\" pandas" + # tests + - cmd: activate pandas + - cmd: test.bat diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index dcea59545aae3..9c333f62810f4 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -21,32 +21,70 @@ "environment_type": "conda", // the base URL to show a commit for the project. - "show_commit_url": "https://github.com/pydata/pandas/commit/", + "show_commit_url": "https://github.com/pandas-dev/pandas/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // "pythons": ["2.7", "3.4"], - "pythons": ["2.7"], + "pythons": ["3.6"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty - // list indicates to just test against the default (latest) - // version. + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). "matrix": { - // To run against multiple versions, replace with - // "numpy": ["1.7", "1.9"], "numpy": [], "Cython": [], "matplotlib": [], "sqlalchemy": [], "scipy": [], "numexpr": [], - "pytables": [], + "pytables": [null, ""], // platform dependent, see excludes below + "tables": [null, ""], "openpyxl": [], + "xlsxwriter": [], "xlrd": [], - "xlwt": [] + "xlwt": [], + "pytest": [], + // If using Windows with python 2.7 and want to build using the + // mingw toolchain (rather than MSVC), uncomment the following line. + // "libpython": [], }, + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. 
+ "exclude": [ + // On conda install pytables, otherwise tables + {"environment_type": "conda", "tables": ""}, + {"environment_type": "conda", "pytables": null}, + {"environment_type": "(?!conda).*", "tables": null}, + {"environment_type": "(?!conda).*", "pytables": ""}, + ], + "include": [], + // The directory (relative to the current directory) that benchmarks are // stored in. If not provided, defaults to "benchmarks" // "benchmark_dir": "benchmarks", @@ -55,7 +93,6 @@ // environments in. If not provided, defaults to "env" // "env_dir": "env", - // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". // "results_dir": "results", @@ -65,5 +102,25 @@ // "html_dir": "html", // The number of characters to retain in the commit hashes. - // "hash_length": 8 + // "hash_length": 8, + + // `asv` will cache wheels of the recent builds in each + // environment, making them faster to install next time. This is + // number of builds to keep, per environment. + "wheel_cache_size": 8, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + "regressions_first_commits": { + ".*": "v0.20.0" + }, + "regression_thresholds": { + ".*": 0.05 + } } diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py new file mode 100644 index 0000000000000..cccd38ef11251 --- /dev/null +++ b/asv_bench/benchmarks/algorithms.py @@ -0,0 +1,128 @@ +import warnings +from importlib import import_module + +import numpy as np +import pandas as pd +from pandas.util import testing as tm + +for imp in ['pandas.util', 'pandas.tools.hashing']: + try: + hashing = import_module(imp) + break + except: + pass + +from .pandas_vb_common import setup # noqa + + +class Factorize(object): + + goal_time = 0.2 + + params = [True, False] + param_names = ['sort'] + + def setup(self, sort): + N = 10**5 + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) + + def time_factorize_int(self, sort): + self.int_idx.factorize(sort=sort) + + def time_factorize_float(self, sort): + self.float_idx.factorize(sort=sort) + + def time_factorize_string(self, sort): + self.string_idx.factorize(sort=sort) + + +class Duplicated(object): + + goal_time = 0.2 + + params = ['first', 'last', False] + param_names = ['keep'] + + def setup(self, keep): + N = 10**5 + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) + + def time_duplicated_int(self, keep): + self.int_idx.duplicated(keep=keep) + + def time_duplicated_float(self, keep): + self.float_idx.duplicated(keep=keep) + + def time_duplicated_string(self, keep): + self.string_idx.duplicated(keep=keep) + + +class DuplicatedUniqueIndex(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) + # cache is_unique + self.idx_int_dup.is_unique + + def time_duplicated_unique_int(self): + self.idx_int_dup.duplicated() + + +class Match(object): + + goal_time = 0.2 + 
+ def setup(self): + self.uniques = tm.makeStringIndex(1000).values + self.all = self.uniques.repeat(10) + + def time_match_string(self): + with warnings.catch_warnings(record=True): + pd.match(self.all, self.uniques) + + +class Hashing(object): + + goal_time = 0.2 + + def setup_cache(self): + N = 10**5 + + df = pd.DataFrame( + {'strings': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'floats': np.random.randn(N), + 'ints': np.arange(N), + 'dates': pd.date_range('20110101', freq='s', periods=N), + 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) + df['categories'] = df['strings'].astype('category') + df.iloc[10:20] = np.nan + return df + + def time_frame(self, df): + hashing.hash_pandas_object(df) + + def time_series_int(self, df): + hashing.hash_pandas_object(df['ints']) + + def time_series_string(self, df): + hashing.hash_pandas_object(df['strings']) + + def time_series_float(self, df): + hashing.hash_pandas_object(df['floats']) + + def time_series_categorical(self, df): + hashing.hash_pandas_object(df['categories']) + + def time_series_timedeltas(self, df): + hashing.hash_pandas_object(df['timedeltas']) + + def time_series_dates(self, df): + hashing.hash_pandas_object(df['dates']) diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index 2b10cb88a3134..48f0b7d71144c 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,23 +1,40 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame +try: + from pandas.util import cache_readonly +except ImportError: + from pandas.util.decorators import cache_readonly +from .pandas_vb_common import setup # noqa + + +class DataFrameAttributes(object): -class getattr_dataframe_index(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index - def time_getattr_dataframe_index(self): + def time_get_index(self): self.foo = self.df.index + def time_set_index(self): + self.df.index = self.cur_index + + +class CacheReadonly(object): -class setattr_dataframe_index(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10, 6)) - self.cur_index = self.df.index - def time_setattr_dataframe_index(self): - self.df.index = self.cur_index \ No newline at end of file + class Foo: + + @cache_readonly + def prop(self): + return 5 + self.obj = Foo() + + def time_cache_readonly(self): + self.obj.prop diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index d22d01f261b27..cc8766e1fa39c 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,261 +1,151 @@ -from .pandas_vb_common import * -import pandas.computation.expressions as expr +import numpy as np +from pandas import DataFrame, Series, date_range +from pandas.core.algorithms import checked_add_with_arr +try: + import pandas.core.computation.expressions as expr +except ImportError: + import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa -class frame_add(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) +class Ops(object): - def time_frame_add(self): - (self.df + self.df2) - - -class frame_add_no_ne(object): goal_time = 0.2 - def setup(self): + params = [[True, False], ['default', 1]] + param_names = ['use_numexpr', 'threads'] + + def setup(self, use_numexpr, threads): 
self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_add_no_ne(self): - (self.df + self.df2) - def teardown(self): - expr.set_use_numexpr(True) + if threads != 'default': + expr.set_numexpr_threads(threads) + if not use_numexpr: + expr.set_use_numexpr(False) + def time_frame_add(self, use_numexpr, threads): + self.df + self.df2 -class frame_add_st(object): - goal_time = 0.2 + def time_frame_mult(self, use_numexpr, threads): + self.df * self.df2 - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) + def time_frame_multi_and(self, use_numexpr, threads): + self.df[(self.df > 0) & (self.df2 > 0)] - def time_frame_add_st(self): - (self.df + self.df2) + def time_frame_comparison(self, use_numexpr, threads): + self.df > self.df2 - def teardown(self): + def teardown(self, use_numexpr, threads): + expr.set_use_numexpr(True) expr.set_numexpr_threads() -class frame_float_div(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_float_div(self): - (self.df // self.df2) +class Ops2(object): - -class frame_float_div_by_zero(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) + N = 10**3 + self.df = DataFrame(np.random.randn(N, N)) + self.df2 = DataFrame(np.random.randn(N, N)) - def time_frame_float_div_by_zero(self): - (self.df / 0) + self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) + self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) + # Division -class frame_float_floor_by_zero(object): - goal_time = 0.2 + def time_frame_float_div(self): + self.df // self.df2 - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) + def time_frame_float_div_by_zero(self): + self.df / 0 def time_frame_float_floor_by_zero(self): - (self.df // 0) - - -class frame_float_mod(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_float_mod(self): - (self.df / self.df2) - - -class frame_int_div_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + self.df // 0 def time_frame_int_div_by_zero(self): - (self.df / 0) + self.df_int / 0 - -class frame_int_mod(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) - self.df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + # Modulo def time_frame_int_mod(self): - (self.df / self.df2) - - -class frame_mult(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_mult(self): - (self.df * self.df2) - + self.df_int % self.df2_int -class frame_mult_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_mult_no_ne(self): - (self.df * self.df2) - - def 
teardown(self): - expr.set_use_numexpr(True) - - -class frame_mult_st(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_mult_st(self): - (self.df * self.df2) - - def teardown(self): - expr.set_numexpr_threads() - - -class frame_multi_and(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_multi_and(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - -class frame_multi_and_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_multi_and_no_ne(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - def teardown(self): - expr.set_use_numexpr(True) - - -class frame_multi_and_st(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_multi_and_st(self): - self.df[((self.df > 0) & (self.df2 > 0))] + def time_frame_float_mod(self): + self.df % self.df2 - def teardown(self): - expr.set_numexpr_threads() +class Timeseries(object): -class series_timestamp_compare(object): goal_time = 0.2 - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T')) - self.ts = self.s[self.halfway] + params = [None, 'US/Eastern'] + param_names = ['tz'] - def time_series_timestamp_compare(self): - (self.s <= self.ts) + def setup(self, tz): + N = 10**6 + halfway = (N // 2) - 1 + self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz)) + self.ts = self.s[halfway] + self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz)) -class timestamp_ops_diff1(object): - goal_time = 0.2 - N = 1000000 + def time_series_timestamp_compare(self, tz): + self.s <= self.ts - def setup(self): - self.s = self.create() + def time_timestamp_series_compare(self, tz): + self.ts >= self.s - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s')) + def time_timestamp_ops_diff(self, tz): + self.s2.diff() - def time_timestamp_ops_diff1(self): - self.s.diff() + def time_timestamp_ops_diff_with_shift(self, tz): + self.s - self.s.shift() -class timestamp_tz_ops_diff1(timestamp_ops_diff1): - N = 10000 - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) +class AddOverflowScalar(object): -class timestamp_ops_diff2(object): goal_time = 0.2 - N = 1000000 - def setup(self): - self.s = self.create() + params = [1, -1, 0] + param_names = ['scalar'] - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s')) + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) - def time_timestamp_ops_diff2(self): - (self.s - self.s.shift()) + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) -class timestamp_tz_ops_diff2(timestamp_ops_diff2): - N = 10000 - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) +class AddOverflowArray(object): -class timestamp_series_compare(object): goal_time = 0.2 - N = 1000000 def setup(self): - self.halfway = ((self.N // 2) - 1) - self.s = self.create() - self.ts = self.s[self.halfway] + N 
= 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) - def create(self): - return Series(date_range('20010101', periods=self.N, freq='T')) + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) - def time_timestamp_series_compare(self): - (self.ts >= self.s) + def time_add_overflow_arr_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) -class timestamp_tz_series_compare(timestamp_series_compare): - N = 10000 + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) - def create(self): - return Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) + def time_add_overflow_both_arg_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a0f9383336940..2a7717378c280 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,45 +1,247 @@ -from .pandas_vb_common import * -import string +import warnings + +import numpy as np +import pandas as pd +import pandas.util.testing as tm +try: + from pandas.api.types import union_categoricals +except ImportError: + try: + from pandas.types.concat import union_categoricals + except ImportError: + pass + +from .pandas_vb_common import setup # noqa + + +class Concat(object): -class concat_categorical(object): goal_time = 0.2 def setup(self): - self.s = pd.Series((list('aabbcd') * 1000000)).astype('category') + N = 10**5 + self.s = pd.Series(list('aabbcd') * N).astype('category') + + self.a = pd.Categorical(list('aabbcd') * N) + self.b = pd.Categorical(list('bbcdjk') * N) + + def time_concat(self): + pd.concat([self.s, self.s]) - def time_concat_categorical(self): - concat([self.s, self.s]) + def time_union(self): + union_categoricals([self.a, self.b]) -class categorical_value_counts(object): - goal_time = 1 +class Constructor(object): + + goal_time = 0.2 def setup(self): - n = 500000 - np.random.seed(2718281) + N = 10**5 + self.categories = list('abcde') + self.cat_idx = pd.Index(self.categories) + self.values = np.tile(self.categories, N) + self.codes = np.tile(range(len(self.categories)), N) + + self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00', + periods=N / 10, + freq='s')) + self.datetimes_with_nat = self.datetimes.copy() + self.datetimes_with_nat.iloc[-1] = pd.NaT + + self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) + self.values_all_nan = [np.nan] * len(self.values) + self.values_all_int8 = np.ones(N, 'int8') + + def time_regular(self): + pd.Categorical(self.values, self.categories) + + def time_fastpath(self): + pd.Categorical(self.codes, self.cat_idx, fastpath=True) + + def time_datetimes(self): + pd.Categorical(self.datetimes) + + def time_datetimes_with_nat(self): + pd.Categorical(self.datetimes_with_nat) + + def time_with_nan(self): + pd.Categorical(self.values_some_nan) + + def time_all_nan(self): + pd.Categorical(self.values_all_nan) + + def time_from_codes_all_int8(self): + pd.Categorical.from_codes(self.values_all_int8, self.categories) + + +class ValueCounts(object): + + goal_time = 0.2 + + params = [True, False] + param_names = ['dropna'] + + def setup(self, dropna): + n = 5 * 10**5 arr = ['s%04d' 
% i for i in np.random.randint(0, n // 10, size=n)] - self.ts = Series(arr).astype('category') + self.ts = pd.Series(arr).astype('category') + + def time_value_counts(self, dropna): + self.ts.value_counts(dropna=dropna) - def time_value_counts(self): - self.ts.value_counts(dropna=False) - def time_value_counts_dropna(self): - self.ts.value_counts(dropna=True) +class Repr(object): -class categorical_constructor(object): goal_time = 0.2 def setup(self): - n = 5 - N = 1e6 - self.categories = list(string.ascii_letters[:n]) - self.cat_idx = Index(self.categories) - self.values = np.tile(self.categories, N) - self.codes = np.tile(range(n), N) + self.sel = pd.Series(['s1234']).astype('category') - def time_regular_constructor(self): - Categorical(self.values, self.categories) + def time_rendering(self): + str(self.sel) - def time_fastpath(self): - Categorical(self.codes, self.cat_idx, fastpath=True) +class SetCategories(object): + + goal_time = 0.2 + + def setup(self): + n = 5 * 10**5 + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype('category') + + def time_set_categories(self): + self.ts.cat.set_categories(self.ts.cat.categories[::2]) + + +class Rank(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + ncats = 100 + + self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) + self.s_str_cat = self.s_str.astype('category') + with warnings.catch_warnings(record=True): + self.s_str_cat_ordered = self.s_str.astype('category', + ordered=True) + + self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) + self.s_int_cat = self.s_int.astype('category') + with warnings.catch_warnings(record=True): + self.s_int_cat_ordered = self.s_int.astype('category', + ordered=True) + + def time_rank_string(self): + self.s_str.rank() + + def time_rank_string_cat(self): + self.s_str_cat.rank() + + def time_rank_string_cat_ordered(self): + self.s_str_cat_ordered.rank() + + def time_rank_int(self): + self.s_int.rank() + + def time_rank_int_cat(self): + self.s_int_cat.rank() + + def time_rank_int_cat_ordered(self): + self.s_int_cat_ordered.rank() + + +class Isin(object): + + goal_time = 0.2 + + params = ['object', 'int64'] + param_names = ['dtype'] + + def setup(self, dtype): + np.random.seed(1234) + n = 5 * 10**5 + sample_size = 100 + arr = [i for i in np.random.randint(0, n // 10, size=n)] + if dtype == 'object': + arr = ['s%04d' % i for i in arr] + self.sample = np.random.choice(arr, sample_size) + self.series = pd.Series(arr).astype('category') + + def time_isin_categorical(self, dtype): + self.series.isin(self.sample) + + +class IsMonotonic(object): + + def setup(self): + N = 1000 + self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N)) + self.s = pd.Series(self.c) + + def time_categorical_index_is_monotonic_increasing(self): + self.c.is_monotonic_increasing + + def time_categorical_index_is_monotonic_decreasing(self): + self.c.is_monotonic_decreasing + + def time_categorical_series_is_monotonic_increasing(self): + self.s.is_monotonic_increasing + + def time_categorical_series_is_monotonic_decreasing(self): + self.s.is_monotonic_decreasing + + +class Contains(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.ci = tm.makeCategoricalIndex(N) + self.c = self.ci.values + self.key = self.ci.categories[0] + + def time_categorical_index_contains(self): + self.key in self.ci + + def time_categorical_contains(self): + self.key in self.c + + +class CategoricalSlicing(object): + + goal_time = 0.2 + params = 
['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + param_names = ['index'] + + def setup(self, index): + N = 10**6 + values = list('a' * N + 'b' * N + 'c' * N) + indices = { + 'monotonic_incr': pd.Categorical(values), + 'monotonic_decr': pd.Categorical(reversed(values)), + 'non_monotonic': pd.Categorical(list('abc' * N))} + self.data = indices[index] + + self.scalar = 10000 + self.list = list(range(10000)) + self.cat_scalar = 'b' + + def time_getitem_scalar(self, index): + self.data[self.scalar] + + def time_getitem_slice(self, index): + self.data[:self.scalar] + + def time_getitem_list_like(self, index): + self.data[[self.scalar]] + + def time_getitem_list(self, index): + self.data[self.list] + + def time_getitem_bool_array(self, index): + self.data[self.data == self.cat_scalar] diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 265ffbc7261ca..3f9016787aab4 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,52 +1,66 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex +from .pandas_vb_common import setup # noqa -class frame_constructor_ndarray(object): - goal_time = 0.2 - - def setup(self): - self.arr = np.random.randn(100, 100) - def time_frame_constructor_ndarray(self): - DataFrame(self.arr) +class SeriesConstructors(object): - -class ctor_index_array_string(object): goal_time = 0.2 - def setup(self): - self.data = np.array(['foo', 'bar', 'baz'], dtype=object) + param_names = ["data_fmt", "with_index"] + params = [[lambda x: x, + list, + lambda arr: list(arr.astype(str)), + lambda arr: dict(zip(range(len(arr)), arr)), + lambda arr: [(i, -i) for i in arr], + lambda arr: [[i, -i] for i in arr], + lambda arr: ([(i, -i) for i in arr][:-1] + [None]), + lambda arr: ([[i, -i] for i in arr][:-1] + [None])], + [False, True]] + + def setup(self, data_fmt, with_index): + N = 10**4 + arr = np.random.randn(N) + self.data = data_fmt(arr) + self.index = np.arange(N) if with_index else None + + def time_series_constructor(self, data_fmt, with_index): + Series(self.data, index=self.index) - def time_ctor_index_array_string(self): - Index(self.data) +class SeriesDtypesConstructors(object): -class series_constructor_ndarray(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(100) - self.index = Index(np.arange(100)) + N = 10**4 + self.arr = np.random.randn(N, N) + self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) + self.s = Series([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')] * N * 10) - def time_series_constructor_ndarray(self): - Series(self.data, index=self.index) + def time_index_from_array_string(self): + Index(self.arr_str) + def time_index_from_array_floats(self): + Index(self.arr) -class dtindex_from_series_ctor(object): - goal_time = 0.2 + def time_dtindex_from_series(self): + DatetimeIndex(self.s) - def setup(self): - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) + def time_dtindex_from_index_with_series(self): + Index(self.s) - def time_dtindex_from_series_ctor(self): - DatetimeIndex(self.s) +class MultiIndexConstructor(object): -class index_from_series_ctor(object): goal_time = 0.2 def setup(self): - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) + N = 10**4 + self.iterables = [tm.makeStringIndex(N), range(20)] - def time_index_from_series_ctor(self): - 
Index(self.s) \ No newline at end of file + def time_multiindex_from_iterables(self): + MultiIndex.from_product(self.iterables) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 719d92567a7be..8e581dcf22b4c 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,239 +1,67 @@ -from .pandas_vb_common import * +import numpy as np import pandas as pd -import pandas.computation.expressions as expr +try: + import pandas.core.computation.expressions as expr +except ImportError: + import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa -class eval_frame_add_all_threads(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_add_all_threads(self): - pd.eval('df + df2 + df3 + df4') - - -class eval_frame_add_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_add_one_thread(self): - pd.eval('df + df2 + df3 + df4') - - -class eval_frame_add_python(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_add_python(self): - pd.eval('df + df2 + df3 + df4', engine='python') - - -class eval_frame_add_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_add_python_one_thread(self): - pd.eval('df + df2 + df3 + df4', engine='python') - - -class eval_frame_and_all_threads(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_and_all_threads(self): - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') - - -class eval_frame_and_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_and_python_one_thread(self): - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python') - - -class eval_frame_and_python(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_and_python(self): - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python') +class Eval(object): -class eval_frame_chained_cmp_all_threads(object): goal_time = 
0.2 - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_chained_cmp_all_threads(self): - pd.eval('df < df2 < df3 < df4') - - -class eval_frame_chained_cmp_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_chained_cmp_python_one_thread(self): - pd.eval('df < df2 < df3 < df4', engine='python') - - -class eval_frame_chained_cmp_python(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_chained_cmp_python(self): - pd.eval('df < df2 < df3 < df4', engine='python') - - -class eval_frame_mult_all_threads(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_mult_all_threads(self): - pd.eval('df * df2 * df3 * df4') + params = [['numexpr', 'python'], [1, 'all']] + param_names = ['engine', 'threads'] + def setup(self, engine, threads): + self.df = pd.DataFrame(np.random.randn(20000, 100)) + self.df2 = pd.DataFrame(np.random.randn(20000, 100)) + self.df3 = pd.DataFrame(np.random.randn(20000, 100)) + self.df4 = pd.DataFrame(np.random.randn(20000, 100)) -class eval_frame_mult_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_mult_one_thread(self): - pd.eval('df * df2 * df3 * df4') - + if threads == 1: + expr.set_numexpr_threads(1) -class eval_frame_mult_python(object): - goal_time = 0.2 + def time_add(self, engine, threads): + pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine) - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) + def time_and(self, engine, threads): + pd.eval('(self.df > 0) & (self.df2 > 0) & ' + '(self.df3 > 0) & (self.df4 > 0)', engine=engine) - def time_eval_frame_mult_python(self): - pd.eval('df * df2 * df3 * df4', engine='python') + def time_chained_cmp(self, engine, threads): + pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine) + def time_mult(self, engine, threads): + pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine) -class eval_frame_mult_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) + def teardown(self, engine, threads): + 
expr.set_numexpr_threads() - def time_eval_frame_mult_python_one_thread(self): - pd.eval('df * df2 * df3 * df4', engine='python') +class Query(object): -class query_datetime_index(object): goal_time = 0.2 def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) + N = 10**6 + halfway = (N // 2) - 1 + index = pd.date_range('20010101', periods=N, freq='T') + s = pd.Series(index) + self.ts = s.iloc[halfway] + self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': s}, + index=index) + data = np.random.randn(N) + self.min_val = data.min() + self.max_val = data.max() def time_query_datetime_index(self): - self.df.query('index < @ts') + self.df.query('index < @self.ts') - -class query_datetime_series(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'dates': self.s.values, }) - - def time_query_datetime_series(self): - self.df.query('dates < @ts') - - -class query_with_boolean_selection(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.N = 1000000 - self.df = DataFrame({'a': np.random.randn(self.N), }) - self.min_val = self.df['a'].min() - self.max_val = self.df['a'].max() + def time_query_datetime_column(self): + self.df.query('dates < @self.ts') def time_query_with_boolean_selection(self): - self.df.query('(a >= @min_val) & (a <= @max_val)') \ No newline at end of file + self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 85f3c1628bd8b..9def910df0bab 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,1706 +1,101 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: - from pandas.tseries.offsets import * -except: - from pandas.core.datetools import * + from pandas.tseries.offsets import Nano, Hour +except ImportError: + # For compatibility with older versions + from pandas.core.datetools import * # noqa - -class frame_ctor_dtindex_BDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BDay(2, 
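The Query benchmarks above exercise the `@` prefix in DataFrame.query, which tells the expression engine to resolve a name from the surrounding Python scope (here the instance attributes `@self.ts`, `@self.min_val`, `@self.max_val`) rather than treating it as a column; bare names still refer to columns or the index. A self-contained illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.random.randn(1000)},
                  index=pd.date_range('2001-01-01', periods=1000, freq='T'))
cutoff = df.index[499]
before = df.query('index < @cutoff')   # '@cutoff' is the local variable
assert len(before) == 499              # rows strictly before the cutoff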
**{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthEndx2(self): - DataFrame(self.d) - - def 
get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * 
((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, 
self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessHourx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessHour(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessHourx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessHourx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessHour(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessHourx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthBegin(1, **{})) - 
self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CDayx1(self): - DataFrame(self.d) - - def get_period_count(self, 
start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CustomBusinessDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CustomBusinessDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CustomBusinessDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CustomBusinessDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CustomBusinessDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CustomBusinessDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_DateOffsetx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(DateOffset(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_DateOffsetx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days 
// self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_DateOffsetx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(DateOffset(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_DateOffsetx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Dayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Day(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Dayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Dayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Day(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Dayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Easterx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Easter(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Easterx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Easterx2(object): - 
goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Easter(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Easterx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx1__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx1__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx2__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx2__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return 
date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x1__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x1__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x1__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x1__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x2__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x2__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - 
start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x2__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x2__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Hourx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Hour(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Hourx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Hourx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Hour(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Hourx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_LastWeekOfMonthx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(LastWeekOfMonth(1, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_LastWeekOfMonthx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - 
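Each of the several dozen frame_ctor_dtindex_* classes removed in this file carried verbatim copies of the same two helpers. For reference, the duplicated logic amounts to this single module-level pair (a cleaned-up restatement with comments, not code from the patch itself):

from pandas import Timestamp, date_range

def get_period_count(start_date, off):
    # Days covered by ten applications of the offset; zero means the
    # offset is sub-daily, so the full 1000 periods always fit.
    ten_offsets_in_days = ((start_date + off * 10) - start_date).days
    if ten_offsets_in_days == 0:
        return 1000
    # Otherwise cap the count (with a safety factor) so the resulting
    # index cannot run past Timestamp.max; never request more than 1000.
    max_periods = 9 * ((Timestamp.max - start_date).days //
                       ten_offsets_in_days)
    return min(max_periods, 1000)

def get_index_for_offset(off):
    start_date = Timestamp('1/1/1900')
    return date_range(start_date,
                      periods=min(1000, get_period_count(start_date, off)),
                      freq=off)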
return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) +from .pandas_vb_common import setup # noqa -class frame_ctor_dtindex_LastWeekOfMonthx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(LastWeekOfMonth(2, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_LastWeekOfMonthx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Microx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Micro(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Microx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Microx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Micro(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Microx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Millix1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Milli(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Millix1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Millix2(object): - goal_time = 0.2 - - def setup(self): 
- self.idx = self.get_index_for_offset(Milli(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Millix2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Minutex1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Minute(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Minutex1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Minutex2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Minute(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Minutex2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthBeginx2(self): - 
DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Nanox1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Nano(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Nanox1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Nanox2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Nano(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Nanox2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // 
self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), 
freq=off) - - -class frame_ctor_dtindex_Secondx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Second(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Secondx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Secondx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Second(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Secondx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_WeekOfMonthx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(WeekOfMonth(1, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_WeekOfMonthx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_WeekOfMonthx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(WeekOfMonth(2, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_WeekOfMonthx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Weekx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Week(1, **{})) - self.df = 
DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) +class FromDicts(object): - def time_frame_ctor_dtindex_Weekx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Weekx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Week(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Weekx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearBeginx1(object): goal_time = 0.2 def setup(self): - self.idx = self.get_index_for_offset(YearBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) + N, K = 5000, 50 + self.index = tm.makeStringIndex(N) + self.columns = tm.makeStringIndex(K) + frame = DataFrame(np.random.randn(N, K), index=self.index, + columns=self.columns) + self.data = frame.to_dict() + self.dict_list = frame.to_dict(orient='records') + self.data2 = {i: {j: float(j) for j in range(100)} + for i in range(2000)} + + def time_list_of_dict(self): + DataFrame(self.dict_list) + def time_nested_dict(self): + DataFrame(self.data) -class frame_ctor_dtindex_YearBeginx2(object): - goal_time = 0.2 + def time_nested_dict_index(self): + DataFrame(self.data, index=self.index) - def setup(self): - self.idx = self.get_index_for_offset(YearBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) + def time_nested_dict_columns(self): + DataFrame(self.data, columns=self.columns) - def time_frame_ctor_dtindex_YearBeginx2(self): - DataFrame(self.d) + def time_nested_dict_index_columns(self): + DataFrame(self.data, index=self.index, columns=self.columns) - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - 
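The FromDicts benchmarks above time the DataFrame constructor on the two dict shapes produced in setup: to_dict() returns a nested {column: {index: value}} mapping, while to_dict(orient='records') returns one dict per row. On a tiny frame:

from pandas import DataFrame

frame = DataFrame({'a': [1, 2], 'b': [3, 4]}, index=['x', 'y'])
nested = frame.to_dict()                    # {'a': {'x': 1, 'y': 2}, ...}
records = frame.to_dict(orient='records')   # [{'a': 1, 'b': 3}, ...]
# Both shapes round-trip through the constructor (records drop the index).
assert DataFrame(nested).shape == frame.shape
assert DataFrame(records).shape == frame.shape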
return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + def time_nested_dict_int64(self): + # nested dict, integer indexes, regression described in #621 + DataFrame(self.data2) - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) +class FromSeries(object): -class frame_ctor_dtindex_YearEndx1(object): goal_time = 0.2 def setup(self): - self.idx = self.get_index_for_offset(YearEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearEndx1(self): - DataFrame(self.d) + mi = MultiIndex.from_product([range(100), range(100)]) + self.s = Series(np.random.randn(10000), index=mi) - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + def time_mi_series(self): + DataFrame(self.s) - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) +class FromDictwithTimestamp(object): -class frame_ctor_dtindex_YearEndx2(object): goal_time = 0.2 + params = [Nano(1), Hour(1)] + param_names = ['offset'] - def setup(self): - self.idx = self.get_index_for_offset(YearEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) + def setup(self, offset): + N = 10**3 + np.random.seed(1234) + idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N) + df = DataFrame(np.random.randn(N, 10), index=idx) + self.d = df.to_dict() - def time_frame_ctor_dtindex_YearEndx2(self): + def time_dict_with_timestamp_offsets(self, offset): DataFrame(self.d) - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_list_of_dict(object): - goal_time = 0.2 - - def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] - - def time_frame_ctor_list_of_dict(self): - DataFrame(self.dict_list) - - -class frame_ctor_nested_dict(object): - goal_time = 0.2 - - def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = 
[dict(zip(self.columns, row)) for row in self.frame.values] - - def time_frame_ctor_nested_dict(self): - DataFrame(self.data) +class FromRecords(object): -class frame_ctor_nested_dict_int64(object): goal_time = 0.2 + params = [None, 1000] + param_names = ['nrows'] - def setup(self): - self.data = dict(((i, dict(((j, float(j)) for j in range(100)))) for i in xrange(2000))) - - def time_frame_ctor_nested_dict_int64(self): - DataFrame(self.data) - - -class frame_from_series(object): - goal_time = 0.2 - - def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)]) - self.s = Series(randn(10000), index=self.mi) + def setup(self, nrows): + N = 100000 + self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) - def time_frame_from_series(self): - DataFrame(self.s) - - -class frame_get_numeric_data(object): - goal_time = 0.2 + def time_frame_from_records_generator(self, nrows): + # issue-6700 + self.df = DataFrame.from_records(self.gen, nrows=nrows) - def setup(self): - self.df = DataFrame(randn(10000, 25)) - self.df['foo'] = 'bar' - self.df['bar'] = 'baz' - self.df = self.df.consolidate() - - def time_frame_get_numeric_data(self): - self.df._get_numeric_data() +class FromNDArray(object): -class series_ctor_from_dict(object): goal_time = 0.2 def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] + N = 100000 + self.data = np.random.randn(N) - def time_series_ctor_from_dict(self): - Series(self.some_dict) \ No newline at end of file + def time_frame_from_ndarray(self): + self.df = DataFrame(self.data) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index a04a9d0814a30..1819cfa2725db 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,950 +1,532 @@ -from .pandas_vb_common import * +import string +import warnings +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + isnull, NaT) -class frame_apply_axis_1(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_axis_1(self): - self.df.apply((lambda x: (x + 1)), axis=1) - +from .pandas_vb_common import setup # noqa -class frame_apply_lambda_mean(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_lambda_mean(self): - self.df.apply((lambda x: x.sum())) +class GetNumericData(object): -class frame_apply_np_mean(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_np_mean(self): - self.df.apply(np.mean) - - -class frame_apply_pass_thru(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_pass_thru(self): - self.df.apply((lambda x: x)) - - -class frame_apply_ref_by_name(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - - def time_frame_apply_ref_by_name(self): - self.df.apply((lambda x: (x['A'] + x['B'])), axis=1) - - -class frame_apply_user_func(object): - goal_time 
= 0.2 - - def setup(self): - self.s = Series(np.arange(1028.0)) - self.df = DataFrame({i: self.s for i in range(1028)}) - - def time_frame_apply_user_func(self): - self.df.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) - - -class frame_assign_timeseries_index(object): - goal_time = 0.2 - - def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='D') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) - - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index - - -class frame_boolean_row_select(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] - - -class frame_count_level_axis0_mixed_dtypes_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis0_mixed_dtypes_multi(self): - self.df.count(axis=0, level=1) - - -class frame_count_level_axis0_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis0_multi(self): - self.df.count(axis=0, level=1) - - -class frame_count_level_axis1_mixed_dtypes_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan + self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis1_mixed_dtypes_multi(self): - self.df.count(axis=1, level=1) - - -class frame_count_level_axis1_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis1_multi(self): - self.df.count(axis=1, level=1) - - -class frame_dropna_axis0_all(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) - - -class frame_dropna_axis0_all_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - 
self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis0_all_mixed_dtypes(self): - self.df.dropna(how='all', axis=0) - - -class frame_dropna_axis0_any(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) - - -class frame_dropna_axis0_any_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis0_any_mixed_dtypes(self): - self.df.dropna(how='any', axis=0) - - -class frame_dropna_axis1_all(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) - - -class frame_dropna_axis1_all_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis1_all_mixed_dtypes(self): - self.df.dropna(how='all', axis=1) - - -class frame_dropna_axis1_any(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) - - -class frame_dropna_axis1_any_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis1_any_mixed_dtypes(self): - self.df.dropna(how='any', axis=1) - - -class frame_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_dtypes(self): - self.df.dtypes - - -class frame_duplicated(object): - goal_time = 0.2 - - def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - def time_frame_duplicated(self): - self.df.duplicated() + self.df['bar'] = 'baz' + with warnings.catch_warnings(record=True): + self.df = self.df.consolidate() -class frame_duplicated_wide(object): - goal_time = 0.2 + def time_frame_get_numeric_data(self): + self.df._get_numeric_data() - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100).astype(str)) - def time_frame_duplicated_wide(self): - self.df.T.duplicated() +class Lookup(object): -class frame_fancy_lookup(object): goal_time = 0.2 def setup(self): - self.df = 
DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + self.df = DataFrame(np.random.randn(10000, 8), + columns=list('abcdefgh')) self.df['foo'] = 'bar' self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') + self.col_labels = list(self.df.columns) * 100 + self.row_labels_all = np.array( + list(self.df.index) * len(self.df.columns), dtype='object') + self.col_labels_all = np.array( + list(self.df.columns) * len(self.df.index), dtype='object') def time_frame_fancy_lookup(self): self.df.lookup(self.row_labels, self.col_labels) - -class frame_fancy_lookup_all(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) - self.df['foo'] = 'bar' - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - def time_frame_fancy_lookup_all(self): self.df.lookup(self.row_labels_all, self.col_labels_all) -class frame_fillna_inplace(object): - goal_time = 0.2 +class Reindex(object): - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - - -class frame_float_equal(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_float_equal(self): - self.test_equal('float_df') - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) - - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + N = 10**3 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) + self.df2 = DataFrame( + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) + def time_reindex_axis0(self): + self.df.reindex(self.idx) -class frame_float_unequal(object): - goal_time = 0.2 - - def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_float_unequal(self): - self.test_unequal('float_df') - - def make_pair(self, frame): - self.df = frame - 
self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) - - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) - - -class frame_from_records_generator(object): - goal_time = 0.2 - - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) + def time_reindex_axis1(self): + self.df.reindex(columns=self.idx) - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) + def time_reindex_both_axes(self): + self.df.reindex(index=self.idx, columns=self.idx) + def time_reindex_both_axes_ix(self): + self.df.ix[self.idx, self.idx] -class frame_from_records_generator_nrows(object): - goal_time = 0.2 + def time_reindex_upcast(self): + self.df2.reindex(np.random.permutation(range(1200))) - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) +class Iteration(object): - -class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) - - def time_frame_get_dtype_counts(self): - self.df.get_dtype_counts() - + N = 1000 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.df2 = DataFrame(np.random.randn(N * 50, 10)) + self.df3 = DataFrame(np.random.randn(N, 5 * N), + columns=['C' + str(c) for c in range(N * 5)]) -class frame_getitem_single_column(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_getitem_single_column(self): - self.h() - - def f(self): + def time_iteritems(self): + # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): + for name, col in self.df.iteritems(): pass - def g(self): - for (name, col) in self.df.iteritems(): + def time_iteritems_cached(self): + for name, col in self.df.iteritems(): pass - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - -class frame_getitem_single_column2(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_getitem_single_column2(self): - self.j() + def time_iteritems_indexing(self): + for col in self.df3: + self.df3[col] - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): + def time_itertuples(self): + for row in self.df2.itertuples(): pass - def g(self): - for (name, col) in self.df.iteritems(): + def time_iterrows(self): + for row in self.df.iterrows(): pass - def h(self): - for i in range(10000): - self.df2['A'] - def j(self): - for i in range(10000): - self.df3[0] +class ToString(object): - -class frame_html_repr_trunc_mi(object): goal_time = 0.2 def setup(self): - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, (self.nrows / 100)), 100)) - self.df = DataFrame(self.data, index=self.idx) - - def time_frame_html_repr_trunc_mi(self): - self.df._repr_html_() - + self.df = DataFrame(np.random.randn(100, 10)) -class 
frame_html_repr_trunc_si(object): - goal_time = 0.2 - - def setup(self): - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = randn(self.nrows) - self.df = DataFrame(self.data, index=self.idx) + def time_to_string_floats(self): + self.df.to_string() - def time_frame_html_repr_trunc_si(self): - self.df._repr_html_() +class ToHTML(object): -class frame_insert_100_columns_begin(object): goal_time = 0.2 def setup(self): - self.N = 1000 + nrows = 500 + self.df2 = DataFrame(np.random.randn(nrows, 10)) + self.df2[0] = period_range('2000', periods=nrows) + self.df2[1] = range(nrows) - def time_frame_insert_100_columns_begin(self): - self.f() + def time_to_html_mixed(self): + self.df2.to_html() - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) +class Repr(object): -class frame_insert_500_columns_end(object): goal_time = 0.2 def setup(self): - self.N = 1000 - - def time_frame_insert_500_columns_end(self): - self.f() + nrows = 10000 + data = np.random.randn(nrows, 10) + arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100) + idx = MultiIndex.from_arrays(arrays) + self.df3 = DataFrame(data, index=idx) + self.df4 = DataFrame(data, index=np.random.randn(nrows)) + self.df_tall = DataFrame(np.random.randn(nrows, 10)) + self.df_wide = DataFrame(np.random.randn(10, nrows)) - def f(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col + def time_html_repr_trunc_mi(self): + self.df3._repr_html_() + def time_html_repr_trunc_si(self): + self.df4._repr_html_() -class frame_interpolate(object): - goal_time = 0.2 + def time_repr_tall(self): + repr(self.df_tall) - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan + def time_frame_repr_wide(self): + repr(self.df_wide) - def time_frame_interpolate(self): - self.df.interpolate() +class MaskBool(object): -class frame_interpolate_some_good(object): goal_time = 0.2 def setup(self): - self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), }) - self.df.loc[1::5, 'A'] = np.nan - self.df.loc[1::5, 'C'] = np.nan + data = np.random.randn(1000, 500) + df = DataFrame(data) + df = df.where(df > 0) + self.bools = df > 0 + self.mask = isnull(df) - def time_frame_interpolate_some_good(self): - self.df.interpolate() + def time_frame_mask_bools(self): + self.bools.mask(self.mask) + + def time_frame_mask_floats(self): + self.bools.astype(float).mask(self.mask) -class frame_interpolate_some_good_infer(object): +class Isnull(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), }) - self.df.loc[1::5, 'A'] = np.nan - self.df.loc[1::5, 'C'] = np.nan + N = 10**3 + self.df_no_null = DataFrame(np.random.randn(N, N)) - def time_frame_interpolate_some_good_infer(self): - self.df.interpolate(downcast='infer') + sample = np.array([np.nan, 1.0]) + data = np.random.choice(sample, (N, N)) + self.df = DataFrame(data) + sample = np.array(list(string.ascii_letters + string.whitespace)) + data = np.random.choice(sample, (N, N)) + self.df_strings = DataFrame(data) -class frame_isnull(object): - goal_time = 0.2 + sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), + np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + data = np.random.choice(sample, (N, 
N)) + self.df_obj = DataFrame(data) - def setup(self): - self.data = np.random.randn(1000, 1000) - self.df = DataFrame(self.data) + def time_isnull_floats_no_null(self): + isnull(self.df_no_null) - def time_frame_isnull(self): + def time_isnull(self): isnull(self.df) + def time_isnull_strings(self): + isnull(self.df_strings) -class frame_iteritems(object): - goal_time = 0.2 + def time_isnull_obj(self): + isnull(self.df_obj) - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - def time_frame_iteritems(self): - self.f() +class Fillna(object): - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass + goal_time = 0.2 + params = ([True, False], ['pad', 'bfill']) + param_names = ['inplace', 'method'] - def g(self): - for (name, col) in self.df.iteritems(): - pass + def setup(self, inplace, method): + values = np.random.randn(10000, 100) + values[::2] = np.nan + self.df = DataFrame(values) - def h(self): - for i in range(10000): - self.df2['A'] + def time_frame_fillna(self, inplace, method): + self.df.fillna(inplace=inplace, method=method) - def j(self): - for i in range(10000): - self.df3[0] +class Dropna(object): -class frame_iteritems_cached(object): goal_time = 0.2 + params = (['all', 'any'], [0, 1]) + param_names = ['how', 'axis'] - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_iteritems_cached(self): - self.g() - - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass + def setup(self, how, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def g(self): - for (name, col) in self.df.iteritems(): - pass + def time_dropna(self, how, axis): + self.df.dropna(how=how, axis=axis) - def h(self): - for i in range(10000): - self.df2['A'] + def time_dropna_axis_mixed_dtypes(self, how, axis): + self.df_mixed.dropna(how=how, axis=axis) - def j(self): - for i in range(10000): - self.df3[0] +class Count(object): -class frame_mask_bools(object): goal_time = 0.2 - def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + params = [0, 1] + param_names = ['axis'] - def time_frame_mask_bools(self): - self.bools.mask(self.mask) + def setup(self, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' + self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, + self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, + self.df_mixed.index]) + self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, + self.df_mixed.columns]) -class frame_mask_floats(object): - goal_time = 0.2 + def time_count_level_multi(self, axis): + self.df.count(axis=axis, level=1) - def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - 
self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + def time_count_level_mixed_dtypes_multi(self, axis): + self.df_mixed.count(axis=axis, level=1) - def time_frame_mask_floats(self): - self.bools.astype(float).mask(self.mask) +class Apply(object): -class frame_nonunique_equal(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_nonunique_equal(self): - self.test_equal('nonunique_cols') - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + self.df = DataFrame(np.random.randn(1000, 100)) - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + self.s = Series(np.arange(1028.0)) + self.df2 = DataFrame({i: self.s for i in range(1028)}) + self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + def time_apply_user_func(self): + self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) -class frame_nonunique_unequal(object): - goal_time = 0.2 + def time_apply_axis_1(self): + self.df.apply(lambda x: x + 1, axis=1) - def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + def time_apply_lambda_mean(self): + self.df.apply(lambda x: x.mean()) - def time_frame_nonunique_unequal(self): - self.test_unequal('nonunique_cols') + def time_apply_np_mean(self): + self.df.apply(np.mean) - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_apply_pass_thru(self): + self.df.apply(lambda x: x) - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_apply_ref_by_name(self): + self.df3.apply(lambda x: x['A'] + x['B'], axis=1) - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) +class Dtypes(object): -class frame_object_equal(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_object_equal(self): - self.test_equal('object_df') - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + self.df = 
DataFrame(np.random.randn(1000, 1000)) - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_frame_dtypes(self): + self.df.dtypes - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) +class Equals(object): -class frame_object_unequal(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_object_unequal(self): - self.test_unequal('object_df') + N = 10**3 + self.float_df = DataFrame(np.random.randn(N, N)) + self.float_df_nan = self.float_df.copy() + self.float_df_nan.iloc[-1, -1] = np.nan - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + self.object_df = DataFrame('foo', index=range(N), columns=range(N)) + self.object_df_nan = self.object_df.copy() + self.object_df_nan.iloc[-1, -1] = np.nan - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + self.nonunique_cols = self.object_df.copy() + self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns) + self.nonunique_cols_nan = self.nonunique_cols.copy() + self.nonunique_cols_nan.iloc[-1, -1] = np.nan - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + def time_frame_float_equal(self): + self.float_df.equals(self.float_df) + def time_frame_float_unequal(self): + self.float_df.equals(self.float_df_nan) -class frame_reindex_axis0(object): - goal_time = 0.2 + def time_frame_nonunique_equal(self): + self.nonunique_cols.equals(self.nonunique_cols) - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + def time_frame_nonunique_unequal(self): + self.nonunique_cols.equals(self.nonunique_cols_nan) - def time_frame_reindex_axis0(self): - self.df.reindex(self.idx) + def time_frame_object_equal(self): + self.object_df.equals(self.object_df) + def time_frame_object_unequal(self): + self.object_df.equals(self.object_df_nan) -class frame_reindex_axis1(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) +class Interpolate(object): - def time_frame_reindex_axis1(self): - self.df.reindex(columns=self.idx) + goal_time = 0.2 + params = [None, 'infer'] + param_names = ['downcast'] + def setup(self, downcast): + N = 10000 + # this is the worst case, where every column has NaNs. 
+ self.df = DataFrame(np.random.randn(N, 100)) + self.df.values[::2] = np.nan -class frame_reindex_both_axes(object): - goal_time = 0.2 + self.df2 = DataFrame({'A': np.arange(0, N), + 'B': np.random.randint(0, 100, N), + 'C': np.random.randn(N), + 'D': np.random.randn(N)}) + self.df2.loc[1::5, 'A'] = np.nan + self.df2.loc[1::5, 'C'] = np.nan - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + def time_interpolate(self, downcast): + self.df.interpolate(downcast=downcast) - def time_frame_reindex_both_axes(self): - self.df.reindex(index=self.idx, columns=self.idx) + def time_interpolate_some_good(self, downcast): + self.df2.interpolate(downcast=downcast) -class frame_reindex_both_axes_ix(object): +class Shift(object): + # frame shift speedup issue-5609 goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + def setup(self, axis): + self.df = DataFrame(np.random.rand(10000, 500)) - def time_frame_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] + def time_shift(self, axis): + self.df.shift(1, axis=axis) -class frame_reindex_upcast(object): - goal_time = 0.2 +class Nunique(object): def setup(self): - self.df = DataFrame(dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), 1: randint(0, 1000, 1000).astype(np.int16), 2: randint(0, 1000, 1000).astype(np.int32), 3: randint(0, 1000, 1000).astype(np.int64), }[randint(0, 4)]) for c in range(1000)])) + self.df = DataFrame(np.random.randn(10000, 1000)) + + def time_frame_nunique(self): + self.df.nunique() - def time_frame_reindex_upcast(self): - self.df.reindex(permutation(range(1200))) +class Duplicated(object): -class frame_repr_tall(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10000, 10)) + n = (1 << 20) + t = date_range('2015-01-01', freq='S', periods=(n // 64)) + xs = np.random.randn(n // 64).round(2) + self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), + 'b': np.random.choice(t, n), + 'c': np.random.choice(xs, n)}) + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T - def time_frame_repr_tall(self): - repr(self.df) - - -class frame_repr_wide(object): - goal_time = 0.2 + def time_frame_duplicated(self): + self.df.duplicated() - def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) + def time_frame_duplicated_wide(self): + self.df2.duplicated() - def time_frame_repr_wide(self): - repr(self.df) +class XS(object): -class frame_shift_axis0(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(np.random.rand(10000, 500)) + def setup(self, axis): + self.N = 10**4 + self.df = DataFrame(np.random.randn(self.N, self.N)) + + def time_frame_xs(self, axis): + self.df.xs(self.N / 2, axis=axis) - def time_frame_shift_axis0(self): - self.df.shift(1, axis=0) +class SortValues(object): -class frame_shift_axis_1(object): goal_time = 0.2 + params = [True, False] + param_names = ['ascending'] - def setup(self): - self.df = DataFrame(np.random.rand(10000, 500)) + def setup(self, ascending): + self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) + + def time_frame_sort_values(self, ascending): + self.df.sort_values(by='A', ascending=ascending) - def time_frame_shift_axis_1(self): - self.df.shift(1, axis=1) +class SortIndexByColumns(object): -class frame_to_html_mixed(object): goal_time = 0.2 def setup(self): - self.nrows = 500 - self.df = 
DataFrame(randn(self.nrows, 10)) - self.df[0] = period_range('2000', '2010', self.nrows) - self.df[1] = range(self.nrows) + N = 10000 + K = 10 + self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), + 'key2': tm.makeStringIndex(N).values.repeat(K), + 'value': np.random.randn(N * K)}) - def time_frame_to_html_mixed(self): - self.df.to_html() + def time_frame_sort_values_by_columns(self): + self.df.sort_values(by=['key1', 'key2']) -class frame_to_string_floats(object): +class Quantile(object): + goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(100, 10)) + def setup(self, axis): + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - def time_frame_to_string_floats(self): - self.df.to_string() + def time_frame_quantile(self, axis): + self.df.quantile([0.1, 0.5], axis=axis) -class frame_xs_col(object): +class GetDtypeCounts(object): + # 2807 goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(1, 100000)) + self.df = DataFrame(np.random.randn(10, 10000)) - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) + def time_frame_get_dtype_counts(self): + self.df.get_dtype_counts() + def time_info(self): + self.df.info() -class frame_xs_row(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(100000, 1)) +class NSort(object): - def time_frame_xs_row(self): - self.df.xs(50000) + goal_time = 0.2 + params = ['first', 'last', 'all'] + param_names = ['keep'] + def setup(self, keep): + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) -class frame_sort_index(object): - goal_time = 0.2 + def time_nlargest(self, keep): + self.df.nlargest(100, 'A', keep=keep) - def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + def time_nsmallest(self, keep): + self.df.nsmallest(100, 'A', keep=keep) - def time_frame_sort_index(self): - self.df.sort_index() +class Describe(object): -class series_string_vector_slice(object): goal_time = 0.2 def setup(self): - self.s = Series((['abcdefg', np.nan] * 500000)) + self.df = DataFrame({ + 'a': np.random.randint(0, 100, int(1e6)), + 'b': np.random.randint(0, 100, int(1e6)), + 'c': np.random.randint(0, 100, int(1e6)) + }) + + def time_series_describe(self): + self.df['a'].describe() - def time_series_string_vector_slice(self): - self.s.str[:5] + def time_dataframe_describe(self): + self.df.describe() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 4b82781fc39d9..21c1ccf46e1c4 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,5 +1,17 @@ -from .pandas_vb_common import * -from pandas.core import common as com +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, read_csv, factorize, date_range +from pandas.core.algorithms import take_1d +try: + from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max, + rolling_var, rolling_skew, rolling_kurt, rolling_std) + have_rolling_methods = True +except ImportError: + have_rolling_methods = False +try: + from pandas._libs import algos +except ImportError: + from pandas import algos try: from pandas.util.testing import test_parallel have_real_test_parallel = True @@ -7,316 +19,257 @@ have_real_test_parallel = False def test_parallel(num_threads=1): - def wrapper(fname): return fname return wrapper +from .pandas_vb_common import BaseIO, setup # noqa -class nogil_groupby_count_2(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - 
np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError - def time_nogil_groupby_count_2(self): - self.pg2() +class ParallelGroupbyMethods(object): - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].count() - - -class nogil_groupby_last_2(object): goal_time = 0.2 + params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod', + 'sum', 'var']) + param_names = ['threads', 'method'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, threads, method): + if not have_real_test_parallel: raise NotImplementedError + N = 10**6 + ngroups = 10**3 + df = DataFrame({'key': np.random.randint(0, ngroups, size=N), + 'data': np.random.randn(N)}) - def time_nogil_groupby_last_2(self): - self.pg2() - - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].last() + @test_parallel(num_threads=threads) + def parallel(): + getattr(df.groupby('key')['data'], method)() + self.parallel = parallel + def loop(): + getattr(df.groupby('key')['data'], method)() + self.loop = loop -class nogil_groupby_max_2(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError + def time_parallel(self, threads, method): + self.parallel() - def time_nogil_groupby_max_2(self): - self.pg2() + def time_loop(self, threads, method): + for i in range(threads): + self.loop() - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].max() +class ParallelGroups(object): -class nogil_groupby_mean_2(object): goal_time = 0.2 + params = [2, 4, 8] + param_names = ['threads'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, threads): + if not have_real_test_parallel: raise NotImplementedError + size = 2**22 + ngroups = 10**3 + data = Series(np.random.randint(0, ngroups, size=size)) - def time_nogil_groupby_mean_2(self): - self.pg2() - - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].mean() - - -class nogil_groupby_min_2(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError + @test_parallel(num_threads=threads) + def get_groups(): + data.groupby(data).groups + self.get_groups = get_groups - def time_nogil_groupby_min_2(self): - self.pg2() + def time_get_groups(self, threads): + self.get_groups() - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].min() +class ParallelTake1D(object): -class nogil_groupby_prod_2(object): goal_time = 0.2 + params = ['int64', 'float64'] + param_names = ['dtype'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': 
np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, dtype): + if not have_real_test_parallel: raise NotImplementedError + N = 10**6 + df = DataFrame({'col': np.arange(N, dtype=dtype)}) + indexer = np.arange(100, len(df) - 100) - def time_nogil_groupby_prod_2(self): - self.pg2() - - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].prod() - - -class nogil_groupby_sum_2(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError + @test_parallel(num_threads=2) + def parallel_take1d(): + take_1d(df['col'].values, indexer) + self.parallel_take1d = parallel_take1d - def time_nogil_groupby_sum_2(self): - self.pg2() + def time_take1d(self, dtype): + self.parallel_take1d() - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].sum() +class ParallelKth(object): -class nogil_groupby_sum_4(object): - goal_time = 0.2 + number = 1 + repeat = 5 def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + if not have_real_test_parallel: raise NotImplementedError + N = 10**7 + k = 5 * 10**5 + kwargs_list = [{'arr': np.random.randn(N)}, + {'arr': np.random.randn(N)}] - def time_nogil_groupby_sum_4(self): - self.pg4() - - def f(self): - self.df.groupby('key')['data'].sum() - - def g2(self): - for i in range(2): - self.f() + @test_parallel(num_threads=2, kwargs_list=kwargs_list) + def parallel_kth_smallest(arr): + algos.kth_smallest(arr, k) + self.parallel_kth_smallest = parallel_kth_smallest - def g4(self): - for i in range(4): - self.f() + def time_kth_smallest(self): + self.parallel_kth_smallest() - def g8(self): - for i in range(8): - self.f() - @test_parallel(num_threads=2) - def pg2(self): - self.f() +class ParallelDatetimeFields(object): - @test_parallel(num_threads=4) - def pg4(self): - self.f() - - @test_parallel(num_threads=8) - def pg8(self): - self.f() - - -class nogil_groupby_sum_8(object): goal_time = 0.2 def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + if not have_real_test_parallel: raise NotImplementedError + N = 10**6 + self.dti = date_range('1900-01-01', periods=N, freq='T') + self.period = self.dti.to_period('D') + + def time_datetime_field_year(self): + @test_parallel(num_threads=2) + def run(dti): + dti.year + run(self.dti) + + def time_datetime_field_day(self): + @test_parallel(num_threads=2) + def run(dti): + dti.day + run(self.dti) + + def time_datetime_field_daysinmonth(self): + @test_parallel(num_threads=2) + def run(dti): + dti.days_in_month + run(self.dti) + + def time_datetime_field_normalize(self): + @test_parallel(num_threads=2) + def run(dti): + dti.normalize() + run(self.dti) + + def time_datetime_to_period(self): + @test_parallel(num_threads=2) + def run(dti): + dti.to_period('S') + run(self.dti) + + def time_period_to_datetime(self): + @test_parallel(num_threads=2) + def run(period): + period.to_timestamp() + run(self.period) + + +class 
ParallelRolling(object): - def time_nogil_groupby_sum_8(self): - self.pg8() - - def f(self): - self.df.groupby('key')['data'].sum() - - def g2(self): - for i in range(2): - self.f() - - def g4(self): - for i in range(4): - self.f() - - def g8(self): - for i in range(8): - self.f() - - @test_parallel(num_threads=2) - def pg2(self): - self.f() - - @test_parallel(num_threads=4) - def pg4(self): - self.f() - - @test_parallel(num_threads=8) - def pg8(self): - self.f() - - -class nogil_groupby_var_2(object): goal_time = 0.2 + params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std'] + param_names = ['method'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, method): + if not have_real_test_parallel: + raise NotImplementedError + win = 100 + arr = np.random.rand(100000) + if hasattr(DataFrame, 'rolling'): + df = DataFrame(arr).rolling(win) + + @test_parallel(num_threads=2) + def parallel_rolling(): + getattr(df, method)() + self.parallel_rolling = parallel_rolling + elif have_rolling_methods: + rolling = {'median': rolling_median, + 'mean': rolling_mean, + 'min': rolling_min, + 'max': rolling_max, + 'var': rolling_var, + 'skew': rolling_skew, + 'kurt': rolling_kurt, + 'std': rolling_std} + + @test_parallel(num_threads=2) + def parallel_rolling(): + rolling[method](arr, win) + self.parallel_rolling = parallel_rolling + else: raise NotImplementedError - def time_nogil_groupby_var_2(self): - self.pg2() + def time_rolling(self, method): + self.parallel_rolling() - @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].var() +class ParallelReadCSV(BaseIO): -class nogil_take1d_float64(object): - goal_time = 0.2 + number = 1 + repeat = 5 + params = ['float', 'object', 'datetime'] + param_names = ['dtype'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, dtype): + if not have_real_test_parallel: raise NotImplementedError - self.N = 10000000.0 - self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), }) - self.indexer = np.arange(100, (len(self.df) - 100)) + rows = 10000 + cols = 50 + data = {'float': DataFrame(np.random.randn(rows, cols)), + 'datetime': DataFrame(np.random.randn(rows, cols), + index=date_range('1/1/2000', + periods=rows)), + 'object': DataFrame('foo', + index=range(rows), + columns=['object{:03d}'.format(i) + for i in range(5)])} - def time_nogil_take1d_float64(self): - self.take_1d_pg2_int64() + self.fname = '__test_{}__.csv'.format(dtype) + df = data[dtype] + df.to_csv(self.fname) - @test_parallel(num_threads=2) - def take_1d_pg2_int64(self): - com.take_1d(self.df.int64.values, self.indexer) + @test_parallel(num_threads=2) + def parallel_read_csv(): + read_csv(self.fname) + self.parallel_read_csv = parallel_read_csv - @test_parallel(num_threads=2) - def take_1d_pg2_float64(self): - com.take_1d(self.df.float64.values, self.indexer) + def time_read_csv(self, dtype): + self.parallel_read_csv() -class nogil_take1d_int64(object): - goal_time = 0.2 +class ParallelFactorize(object): - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': 
np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError - self.N = 10000000.0 - self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), }) - self.indexer = np.arange(100, (len(self.df) - 100)) + number = 1 + repeat = 5 + params = [2, 4, 8] + param_names = ['threads'] - def time_nogil_take1d_int64(self): - self.take_1d_pg2_float64() + def setup(self, threads): + if not have_real_test_parallel: + raise NotImplementedError - @test_parallel(num_threads=2) - def take_1d_pg2_int64(self): - com.take_1d(self.df.int64.values, self.indexer) + strings = tm.makeStringIndex(100000) - @test_parallel(num_threads=2) - def take_1d_pg2_float64(self): - com.take_1d(self.df.float64.values, self.indexer) + @test_parallel(num_threads=threads) + def parallel(): + factorize(strings) + self.parallel = parallel + def loop(): + factorize(strings) + self.loop = loop -class nogil_kth_smallest(object): - number = 1 - repeat = 5 + def time_parallel(self, threads): + self.parallel() - def setup(self): - if (not have_real_test_parallel): - raise NotImplementedError - np.random.seed(1234) - self.N = 10000000 - self.k = 500000 - self.a = np.random.randn(self.N) - self.b = self.a.copy() - self.kwargs_list = [{'arr': self.a}, {'arr': self.b}] - - def time_nogil_kth_smallest(self): - @test_parallel(num_threads=2, kwargs_list=self.kwargs_list) - def run(arr): - algos.kth_smallest(arr, self.k) - run() + def time_loop(self, threads): + for i in range(threads): + self.loop() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 138977a29463e..0725bbeb6c36d 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,774 +1,581 @@ -from .pandas_vb_common import * -from string import ascii_letters, digits +import warnings +from string import ascii_letters from itertools import product +from functools import partial +import numpy as np +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + TimeGrouper, Categorical, Timestamp) +import pandas.util.testing as tm -class groupby_agg_builtins(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(27182) - self.n = 100000 - self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie']) +from .pandas_vb_common import setup # noqa - def time_groupby_agg_builtins1(self): - self.df.groupby('jim').agg([sum, min, max]) - def time_groupby_agg_builtins2(self): - self.df.groupby(['jim', 'joe']).agg([sum, min, max]) +method_blacklist = { + 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', + 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', + 'var', 'mad', 'describe', 'std'}, + 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', + 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', + 'std'} +} -#---------------------------------------------------------------------- -# dict return values -class groupby_apply_dict_return(object): +class ApplyDictReturn(object): goal_time = 0.2 def setup(self): self.labels = np.arange(1000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.f = (lambda x: {'first': x.values[0], 'last': x.values[(-1)], }) + self.data = Series(np.random.randn(len(self.labels))) def time_groupby_apply_dict_return(self): - self.data.groupby(self.labels).apply(self.f) + self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0], 
+ 'last': x.values[-1]}) -#---------------------------------------------------------------------- -# First / last functions +class Apply(object): -class groupby_first_last(object): goal_time = 0.2 - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) - - def time_groupby_first_float32(self): - self.data2.groupby(self.labels).first() + def setup_cache(self): + N = 10**4 + labels = np.random.randint(0, 2000, size=N) + labels2 = np.random.randint(0, 3, size=N) + df = DataFrame({'key': labels, + 'key2': labels2, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4) + }) + return df - def time_groupby_first_float64(self): - self.data.groupby(self.labels).first() + def time_scalar_function_multi_col(self, df): + df.groupby(['key', 'key2']).apply(lambda x: 1) - def time_groupby_last_float32(self): - self.data2.groupby(self.labels).last() + def time_scalar_function_single_col(self, df): + df.groupby('key').apply(lambda x: 1) - def time_groupby_last_float64(self): - self.data.groupby(self.labels).last() + @staticmethod + def df_copy_function(g): + # ensure that the group name is available (see GH #15062) + g.name + return g.copy() - def time_groupby_nth_float32_any(self): - self.data2.groupby(self.labels).nth(0, dropna='all') + def time_copy_function_multi_col(self, df): + df.groupby(['key', 'key2']).apply(self.df_copy_function) - def time_groupby_nth_float32_none(self): - self.data2.groupby(self.labels).nth(0) + def time_copy_overhead_single_col(self, df): + df.groupby('key').apply(self.df_copy_function) - def time_groupby_nth_float64_any(self): - self.data.groupby(self.labels).nth(0, dropna='all') - def time_groupby_nth_float64_none(self): - self.data.groupby(self.labels).nth(0) +class Groups(object): -# with datetimes (GH7555) - -class groupby_first_last_datetimes(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), }) + param_names = ['key'] + params = ['int64_small', 'int64_large', 'object_small', 'object_large'] - def time_groupby_first_datetimes(self): - self.df.groupby('b').first() + def setup_cache(self): + size = 10**6 + data = {'int64_small': Series(np.random.randint(0, 100, size=size)), + 'int64_large': Series(np.random.randint(0, 10000, size=size)), + 'object_small': Series( + tm.makeStringIndex(100).take( + np.random.randint(0, 100, size=size))), + 'object_large': Series( + tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=size)))} + return data - def time_groupby_last_datetimes(self): - self.df.groupby('b').last() + def setup(self, data, key): + self.ser = data[key] - def time_groupby_nth_datetimes_any(self): - self.df.groupby('b').nth(0, dropna='all') + def time_series_groups(self, data, key): + self.ser.groupby(self.ser).groups - def time_groupby_nth_datetimes_none(self): - self.df.groupby('b').nth(0) +class GroupManyLabels(object): -class groupby_first_last_object(object): goal_time = 0.2 + params = [1, 1000] + param_names = ['ncols'] - def setup(self): - self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000)}) - - def time_groupby_first_object(self): - self.df.groupby('b').first() + def setup(self, ncols): + N = 1000 + data = 
np.random.randn(N, ncols) + self.labels = np.random.randint(0, 100, size=N) + self.df = DataFrame(data) - def time_groupby_last_object(self): - self.df.groupby('b').last() - - def time_groupby_nth_object_any(self): - self.df.groupby('b').nth(0, dropna='any') - - def time_groupby_nth_object_none(self): - self.df.groupby('b').nth(0) + def time_sum(self, ncols): + self.df.groupby(self.labels).sum() -#---------------------------------------------------------------------- -# DataFrame Apply overhead +class Nth(object): -class groupby_frame_apply(object): goal_time = 0.2 - def setup(self): - self.N = 10000 - self.labels = np.random.randint(0, 2000, size=self.N) - self.labels2 = np.random.randint(0, 3, size=self.N) - self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), }) + param_names = ['dtype'] + params = ['float32', 'float64', 'datetime', 'object'] - def f(self, g): - return 1 + def setup(self, dtype): + N = 10**5 + # with datetimes (GH7555) + if dtype == 'datetime': + values = date_range('1/1/2011', periods=N, freq='s') + elif dtype == 'object': + values = ['foo'] * N + else: + values = np.arange(N).astype(dtype) - def time_groupby_frame_apply(self): - self.df.groupby(['key', 'key2']).apply(self.f) + key = np.arange(N) + self.df = DataFrame({'key': key, 'values': values}) + self.df.iloc[1, 1] = np.nan  # insert missing data - def time_groupby_frame_apply_overhead(self): - self.df.groupby('key').apply(self.f) + def time_frame_nth_any(self, dtype): + self.df.groupby('key').nth(0, dropna='any') + def time_frame_nth_all(self, dtype): + self.df.groupby('key').nth(0, dropna='all') -#---------------------------------------------------------------------- -# 2d grouping, aggregate many columns + def time_frame_nth(self, dtype): + self.df.groupby('key').nth(0) -class groupby_frame_cython_many_columns(object): - goal_time = 0.2 + def time_series_nth_any(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='any') - def setup(self): - self.labels = np.random.randint(0, 100, size=1000) - self.df = DataFrame(randn(1000, 1000)) + def time_series_nth_all(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='all') - def time_sum(self): - self.df.groupby(self.labels).sum() + def time_series_nth(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0) -#---------------------------------------------------------------------- -# single key, long, integer key +class DateAttributes(object): -class groupby_frame_singlekey_integer(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(100000, 1) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) + rng = date_range('1/1/2000', '12/31/2005', freq='H') + self.year, self.month, self.day = rng.year, rng.month, rng.day + self.ts = Series(np.random.randn(len(rng)), index=rng) - def time_sum(self): - self.df.groupby(self.labels).sum() + def time_len_groupby_object(self): + len(self.ts.groupby([self.year, self.month, self.day])) -#---------------------------------------------------------------------- -# median +class Int64(object): -class groupby_frame(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(100000, 2) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) - - def time_groupby_frame_median(self): - self.df.groupby(self.labels).median() + arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) + i = 
np.random.choice(len(arr), len(arr) * 5) + arr = np.vstack((arr, arr[i])) + i = np.random.permutation(len(arr)) + arr = arr[i] + self.cols = list('abcde') + self.df = DataFrame(arr, columns=self.cols) + self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10 - def time_groupby_simple_compress_timing(self): - self.df.groupby(self.labels).mean() + def time_overflow(self): + self.df.groupby(self.cols).max() -#---------------------------------------------------------------------- -# DataFrame nth +class CountMultiDtype(object): -class groupby_nth(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) - - def time_groupby_frame_nth_any(self): - self.df.groupby(0).nth(0, dropna='any') - - def time_groupby_frame_nth_none(self): - self.df.groupby(0).nth(0) - - def time_groupby_series_nth_any(self): - self.df[1].groupby(self.df[0]).nth(0, dropna='any') + def setup_cache(self): + n = 10000 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + dates[np.random.rand(n) > 0.5] = np.datetime64('nat') + offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') + value2 = np.random.randn(n) + value2[np.random.rand(n) > 0.5] = np.nan + obj = np.random.choice(list('ab'), size=n).astype(object) + obj[np.random.rand(n) > 0.5] = np.nan + df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'dates': dates, + 'value2': value2, + 'value3': np.random.randn(n), + 'ints': np.random.randint(0, 1000, size=n), + 'obj': obj, + 'offsets': offsets}) + return df + + def time_multi_count(self, df): + df.groupby(['key1', 'key2']).count() + + +class CountMultiInt(object): - def time_groupby_series_nth_none(self): - self.df[1].groupby(self.df[0]).nth(0) - - -#---------------------------------------------------------------------- -# groupby_indices replacement, chop up Series - -class groupby_indices(object): goal_time = 0.2 - def setup(self): - try: - self.rng = date_range('1/1/2000', '12/31/2005', freq='H') - (self.year, self.month, self.day) = (self.rng.year, self.rng.month, self.rng.day) - except: - self.rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) - self.year = self.rng.map((lambda x: x.year)) - self.month = self.rng.map((lambda x: x.month)) - self.day = self.rng.map((lambda x: x.day)) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_groupby_indices(self): - len(self.ts.groupby([self.year, self.month, self.day])) - + def setup_cache(self): + n = 10000 + df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'ints': np.random.randint(0, 1000, size=n), + 'ints2': np.random.randint(0, 1000, size=n)}) + return df -class groupby_int64_overflow(object): - goal_time = 0.2 + def time_multi_int_count(self, df): + df.groupby(['key1', 'key2']).count() - def setup(self): - self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5)) - self.i = np.random.choice(len(self.arr), (len(self.arr) * 5)) - self.arr = np.vstack((self.arr, self.arr[self.i])) - self.i = np.random.permutation(len(self.arr)) - self.arr = self.arr[self.i] - self.df = DataFrame(self.arr, columns=list('abcde')) - (self.df['jim'], self.df['joe']) = (np.random.randn(2, len(self.df)) * 10) + def time_multi_int_nunique(self, df): + df.groupby(['key1', 'key2']).nunique() - def time_groupby_int64_overflow(self): - self.df.groupby(list('abcde')).max() +class AggFunctions(object): 
-#---------------------------------------------------------------------- -# count() speed - -class groupby_multi_count(object): - goal_time = 0.2 - - def setup(self): - self.n = 10000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat') - self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat') - self.value2 = np.random.randn(self.n) - self.value2[(np.random.rand(self.n) > 0.5)] = np.nan - self.obj = tm.choice(list('ab'), size=self.n).astype(object) - self.obj[(np.random.randn(self.n) > 0.5)] = np.nan - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), - 'key2': np.random.randint(0, 100, size=self.n), - 'dates': self.dates, - 'value2': self.value2, - 'value3': np.random.randn(self.n), - 'ints': np.random.randint(0, 1000, size=self.n), - 'obj': self.obj, - 'offsets': self.offsets, }) - - def time_groupby_multi_count(self): - self.df.groupby(['key1', 'key2']).count() - - -class groupby_int_count(object): goal_time = 0.2 - def setup(self): - self.n = 10000 - self.df = DataFrame({'key1': randint(0, 500, size=self.n), - 'key2': randint(0, 100, size=self.n), - 'ints': randint(0, 1000, size=self.n), - 'ints2': randint(0, 1000, size=self.n), }) - - def time_groupby_int_count(self): - self.df.groupby(['key1', 'key2']).count() + def setup_cache(self): + N = 10**5 + fac1 = np.array(['A', 'B', 'C'], dtype='O') + fac2 = np.array(['one', 'two'], dtype='O') + df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)), + 'key2': fac2.take(np.random.randint(0, 2, size=N)), + 'value1': np.random.randn(N), + 'value2': np.random.randn(N), + 'value3': np.random.randn(N)}) + return df + def time_different_str_functions(self, df): + df.groupby(['key1', 'key2']).agg({'value1': 'mean', + 'value2': 'var', + 'value3': 'sum'}) -#---------------------------------------------------------------------- -# group with different functions per column - -class groupby_agg_multi(object): - goal_time = 0.2 + def time_different_numpy_functions(self, df): + df.groupby(['key1', 'key2']).agg({'value1': np.mean, + 'value2': np.var, + 'value3': np.sum}) - def setup(self): - self.fac1 = np.array(['A', 'B', 'C'], dtype='O') - self.fac2 = np.array(['one', 'two'], dtype='O') - self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) + def time_different_python_functions_multicol(self, df): + df.groupby(['key1', 'key2']).agg([sum, min, max]) - def time_groupby_multi_different_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum'}) + def time_different_python_functions_singlecol(self, df): + df.groupby('key1').agg([sum, min, max]) - def time_groupby_multi_different_numpy_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum}) +class GroupStrings(object): -class groupby_multi_index(object): goal_time = 0.2 def setup(self): - self.n = (((5 * 7) * 11) * (1 << 9)) - self.alpha = list(map(''.join, product((ascii_letters + digits), repeat=4))) - self.f = (lambda k: np.repeat(np.random.choice(self.alpha, (self.n // k)), k)) - self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5), 'd': self.f(1), }) + n = 2 * 10**5 + alpha = list(map(''.join, 
product(ascii_letters, repeat=4))) + data = np.random.choice(alpha, (n // 5, 4), replace=False) + data = np.repeat(data, 5, axis=0) + self.df = DataFrame(data, columns=list('abcd')) self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3) - self.i = np.random.permutation(len(self.df)) - self.df = self.df.iloc[self.i].reset_index(drop=True).copy() + self.df = self.df.sample(frac=1).reset_index(drop=True) - def time_groupby_multi_index(self): + def time_multi_columns(self): self.df.groupby(list('abcd')).max() -class groupby_multi(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.ngroups = 100 - self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) - self.simple_series = Series(np.random.randn(self.N)) - self.key1 = self.df['key1'] - - def get_test_data(self, ngroups=100, n=100000): - self.unique_groups = range(self.ngroups) - self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object) - if (len(self.arr) < n): - self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) - random.shuffle(self.arr) - return self.arr +class MultiColumn(object): - def f(self): - self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) + goal_time = 0.2 - def time_groupby_multi_cython(self): - self.df.groupby(['key1', 'key2']).sum() + def setup_cache(self): + N = 10**5 + key1 = np.tile(np.arange(100, dtype=object), 1000) + key2 = key1.copy() + np.random.shuffle(key1) + np.random.shuffle(key2) + df = DataFrame({'key1': key1, + 'key2': key2, + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + return df - def time_groupby_multi_python(self): - self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum())) + def time_lambda_sum(self, df): + df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum()) - def time_groupby_multi_series_op(self): - self.df.groupby(['key1', 'key2'])['data1'].agg(np.std) + def time_cython_sum(self, df): + df.groupby(['key1', 'key2']).sum() - def time_groupby_series_simple_cython(self): - self.simple_series.groupby(self.key1).sum() + def time_col_select_lambda_sum(self, df): + df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum()) - def time_groupby_series_simple_rank(self): - self.df.groupby('key1').rank(pct=True) + def time_col_select_numpy_sum(self, df): + df.groupby(['key1', 'key2'])['data1'].agg(np.sum) -#---------------------------------------------------------------------- -# size() speed +class Size(object): -class groupby_size(object): goal_time = 0.2 def setup(self): - self.n = 100000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) - - def time_groupby_multi_size(self): + n = 10**5 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + self.df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'value1': np.random.randn(n), + 'value2': np.random.randn(n), + 'value3': np.random.randn(n), + 'dates': dates}) + self.draws = Series(np.random.randn(n)) + labels = Series(['foo', 'bar', 
'baz', 'qux'] * (n // 4)) + self.cats = labels.astype('category') + + def time_multi_size(self): self.df.groupby(['key1', 'key2']).size() - def time_groupby_dt_size(self): - self.df.groupby(['dates']).size() + def time_dt_timegrouper_size(self): + with warnings.catch_warnings(record=True): + self.df.groupby(TimeGrouper(key='dates', freq='M')).size() - def time_groupby_dt_timegrouper_size(self): - self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + def time_category_size(self): + self.draws.groupby(self.cats).size() -#---------------------------------------------------------------------- -# groupby with a variable value for ngroups +class GroupByMethods(object): -class groupby_ngroups_10000(object): goal_time = 0.2 - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_all(self): - self.df.groupby('value')['timestamp'].all() - - def time_any(self): - self.df.groupby('value')['timestamp'].any() - - def time_count(self): - self.df.groupby('value')['timestamp'].count() - - def time_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() - - def time_cummax(self): - self.df.groupby('value')['timestamp'].cummax() - - def time_cummin(self): - self.df.groupby('value')['timestamp'].cummin() - - def time_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() - - def time_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() - - def time_describe(self): - self.df.groupby('value')['timestamp'].describe() - - def time_diff(self): - self.df.groupby('value')['timestamp'].diff() - - def time_first(self): - self.df.groupby('value')['timestamp'].first() - - def time_head(self): - self.df.groupby('value')['timestamp'].head() - - def time_last(self): - self.df.groupby('value')['timestamp'].last() - - def time_mad(self): - self.df.groupby('value')['timestamp'].mad() - - def time_max(self): - self.df.groupby('value')['timestamp'].max() - - def time_mean(self): - self.df.groupby('value')['timestamp'].mean() - - def time_median(self): - self.df.groupby('value')['timestamp'].median() - - def time_min(self): - self.df.groupby('value')['timestamp'].min() - - def time_nunique(self): - self.df.groupby('value')['timestamp'].nunique() - - def time_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() - - def time_prod(self): - self.df.groupby('value')['timestamp'].prod() - - def time_rank(self): - self.df.groupby('value')['timestamp'].rank() - - def time_sem(self): - self.df.groupby('value')['timestamp'].sem() - - def time_size(self): - self.df.groupby('value')['timestamp'].size() - - def time_skew(self): - self.df.groupby('value')['timestamp'].skew() - - def time_std(self): - self.df.groupby('value')['timestamp'].std() - - def time_sum(self): - self.df.groupby('value')['timestamp'].sum() - - def time_tail(self): - self.df.groupby('value')['timestamp'].tail() - - def time_unique(self): - self.df.groupby('value')['timestamp'].unique() + param_names = ['dtype', 'method', 'application'] + params = [['int', 'float', 'object', 'datetime'], + ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', + 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', + 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', + 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', + 'std', 'sum', 'tail', 'unique', 
'value_counts', 'var'], + ['direct', 'transformation']] + + def setup(self, dtype, method, application): + if method in method_blacklist.get(dtype, {}): + raise NotImplementedError  # skip benchmark + ngroups = 1000 + size = ngroups * 2 + rng = np.arange(ngroups) + values = rng.take(np.random.randint(0, ngroups, size=size)) + if dtype == 'int': + key = np.random.randint(0, size, size=size) + elif dtype == 'float': + key = np.concatenate([np.random.random(ngroups) * 0.1, + np.random.random(ngroups) * 10.0]) + elif dtype == 'object': + key = ['foo'] * size + elif dtype == 'datetime': + key = date_range('1/1/2011', periods=size, freq='s') + + df = DataFrame({'values': values, 'key': key}) + + if application == 'transformation': + if method == 'describe': + raise NotImplementedError + + self.as_group_method = lambda: df.groupby( + 'key')['values'].transform(method) + self.as_field_method = lambda: df.groupby( + 'values')['key'].transform(method) + else: + self.as_group_method = getattr(df.groupby('key')['values'], method) + self.as_field_method = getattr(df.groupby('values')['key'], method) + + def time_dtype_as_group(self, dtype, method, application): + self.as_group_method() + + def time_dtype_as_field(self, dtype, method, application): + self.as_field_method() + + +class RankWithTies(object): + # GH 21237 + goal_time = 0.2 + param_names = ['dtype', 'tie_method'] + params = [['float64', 'float32', 'int64', 'datetime64'], + ['first', 'average', 'dense', 'min', 'max']] - def time_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() + def setup(self, dtype, tie_method): + N = 10**4 + if dtype == 'datetime64': + data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) + else: + data = np.array([1] * N, dtype=dtype) + self.df = DataFrame({'values': data, 'key': ['foo'] * N}) - def time_var(self): - self.df.groupby('value')['timestamp'].var() + def time_rank_ties(self, dtype, tie_method): + self.df.groupby('key').rank(method=tie_method) -class groupby_ngroups_100(object): +class Float32(object): + # GH 13335 goal_time = 0.2 def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_all(self): - self.df.groupby('value')['timestamp'].all() - - def time_any(self): - self.df.groupby('value')['timestamp'].any() - - def time_count(self): - self.df.groupby('value')['timestamp'].count() - - def time_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() - - def time_cummax(self): - self.df.groupby('value')['timestamp'].cummax() - - def time_cummin(self): - self.df.groupby('value')['timestamp'].cummin() - - def time_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() - - def time_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() - - def time_describe(self): - self.df.groupby('value')['timestamp'].describe() - - def time_diff(self): - self.df.groupby('value')['timestamp'].diff() - - def time_first(self): - self.df.groupby('value')['timestamp'].first() - - def time_head(self): - self.df.groupby('value')['timestamp'].head() - - def time_last(self): - self.df.groupby('value')['timestamp'].last() - - def time_mad(self): - self.df.groupby('value')['timestamp'].mad() - - def time_max(self): - self.df.groupby('value')['timestamp'].max() - - def time_mean(self): - self.df.groupby('value')['timestamp'].mean() - - def 
time_median(self): - self.df.groupby('value')['timestamp'].median() - - def time_min(self): - self.df.groupby('value')['timestamp'].min() - - def time_nunique(self): - self.df.groupby('value')['timestamp'].nunique() - - def time_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() - - def time_prod(self): - self.df.groupby('value')['timestamp'].prod() - - def time_rank(self): - self.df.groupby('value')['timestamp'].rank() - - def time_sem(self): - self.df.groupby('value')['timestamp'].sem() - - def time_size(self): - self.df.groupby('value')['timestamp'].size() - - def time_skew(self): - self.df.groupby('value')['timestamp'].skew() - - def time_std(self): - self.df.groupby('value')['timestamp'].std() + tmp1 = (np.random.random(10000) * 0.1).astype(np.float32) + tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) + tmp = np.concatenate((tmp1, tmp2)) + arr = np.repeat(tmp, 10) + self.df = DataFrame(dict(a=arr, b=arr)) def time_sum(self): - self.df.groupby('value')['timestamp'].sum() - - def time_tail(self): - self.df.groupby('value')['timestamp'].tail() - - def time_unique(self): - self.df.groupby('value')['timestamp'].unique() + self.df.groupby(['a'])['b'].sum() - def time_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() - def time_var(self): - self.df.groupby('value')['timestamp'].var() +class Categories(object): - -#---------------------------------------------------------------------- -# Series.value_counts - -class series_value_counts(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.randint(0, 1000, size=100000)) - self.s2 = self.s.astype(float) + N = 10**5 + arr = np.random.random(N) + data = {'a': Categorical(np.random.randint(10000, size=N)), + 'b': arr} + self.df = DataFrame(data) + data = {'a': Categorical(np.random.randint(10000, size=N), + ordered=True), + 'b': arr} + self.df_ordered = DataFrame(data) + data = {'a': Categorical(np.random.randint(100, size=N), + categories=np.arange(10000)), + 'b': arr} + self.df_extra_cat = DataFrame(data) - self.K = 1000 - self.N = 100000 - self.uniques = tm.makeStringIndex(self.K).values - self.s3 = Series(np.tile(self.uniques, (self.N // self.K))) + def time_groupby_sort(self): + self.df.groupby('a')['b'].count() - def time_value_counts_int64(self): - self.s.value_counts() + def time_groupby_nosort(self): + self.df.groupby('a', sort=False)['b'].count() - def time_value_counts_float64(self): - self.s2.value_counts() + def time_groupby_ordered_sort(self): + self.df_ordered.groupby('a')['b'].count() - def time_value_counts_strings(self): - self.s.value_counts() + def time_groupby_ordered_nosort(self): + self.df_ordered.groupby('a', sort=False)['b'].count() + def time_groupby_extra_cat_sort(self): + self.df_extra_cat.groupby('a')['b'].count() -#---------------------------------------------------------------------- -# pivot_table + def time_groupby_extra_cat_nosort(self): + self.df_extra_cat.groupby('a', sort=False)['b'].count() -class groupby_pivot_table(object): - goal_time = 0.2 - def setup(self): - self.fac1 = np.array(['A', 'B', 'C'], dtype='O') - self.fac2 = np.array(['one', 'two'], dtype='O') - self.ind1 = np.random.randint(0, 3, size=100000) - self.ind2 = np.random.randint(0, 2, size=100000) - self.df = DataFrame({'key1': self.fac1.take(self.ind1), 'key2': self.fac2.take(self.ind2), 'key3': self.fac2.take(self.ind2), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) +class Datelike(object): + # GH 14338 + 
goal_time = 0.2 + params = ['period_range', 'date_range', 'date_range_tz'] + param_names = ['grouper'] - def time_groupby_pivot_table(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3']) + def setup(self, grouper): + N = 10**4 + rng_map = {'period_range': period_range, + 'date_range': date_range, + 'date_range_tz': partial(date_range, tz='US/Central')} + self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N) + self.df = DataFrame(np.random.randn(10**4, 2)) + def time_sum(self, grouper): + self.df.groupby(self.grouper).sum() -#---------------------------------------------------------------------- -# Sum booleans #2692 -class groupby_sum_booleans(object): +class SumBools(object): + # GH 2692 goal_time = 0.2 def setup(self): - self.N = 500 - self.df = DataFrame({'ii': range(self.N), 'bb': [True for x in range(self.N)], }) + N = 500 + self.df = DataFrame({'ii': range(N), + 'bb': [True] * N}) def time_groupby_sum_booleans(self): self.df.groupby('ii').sum() -#---------------------------------------------------------------------- -# multi-indexed group sum #9049 - -class groupby_sum_multiindex(object): +class SumMultiLevel(object): + # GH 9049 goal_time = 0.2 + timeout = 120.0 def setup(self): - self.N = 50 - self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B']) + N = 50 + self.df = DataFrame({'A': list(range(N)) * 2, + 'B': range(N * 2), + 'C': 1}).set_index(['A', 'B']) def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() -#------------------------------------------------------------------------------- -# Transform testing - -class groupby_transform(object): - goal_time = 0.2 - - def setup(self): - self.n_dates = 400 - self.n_securities = 250 - self.n_columns = 3 - self.share_na = 0.1 - self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B') - self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates)) - self.secid_min = int('10000000', 16) - self.secid_max = int('F0000000', 16) - self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) - self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step)) - self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], - labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)], - names=['date', 'security_id']) - self.n_data = len(self.data_index) - self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))]) - self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) - self.step = int((self.n_data * self.share_na)) - for column_index in range(self.n_columns): - self.index = column_index - while (self.index < self.n_data): - self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan) - self.index += self.step - self.f_fillna = (lambda x: x.fillna(method='pad')) - - def time_groupby_transform(self): - self.data.groupby(level='security_id').transform(self.f_fillna) - - def time_groupby_transform_ufunc(self): - self.data.groupby(level='date').transform(np.max) - - -class groupby_transform_multi_key(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(2718281) - self.n = 20000 - self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) - - def time_groupby_transform_multi_key1(self): - self.df.groupby(['jim', 
'joe'])['jolie'].transform('max') - +class Transform(object): -class groupby_transform_multi_key2(object): goal_time = 0.2 def setup(self): - np.random.seed(2718281) - self.n = 20000 - self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) - self.df['jim'] = self.df['joe'] + n1 = 400 + n2 = 250 + index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)], + labels=[np.repeat(range(n1), n2).tolist(), + list(range(n2)) * n1], + names=['lev1', 'lev2']) + arr = np.random.randn(n1 * n2, 3) + arr[::10000, 0] = np.nan + arr[1::10000, 1] = np.nan + arr[2::10000, 2] = np.nan + data = DataFrame(arr, index=index, columns=['col1', 'col2', 'col3']) + self.df = data - def time_groupby_transform_multi_key2(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + n = 20000 + self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), + columns=['jim', 'joe', 'jolie']) + self.df2 = self.df1.copy() + self.df2['jim'] = self.df2['joe'] + self.df3 = DataFrame(np.random.randint(1, (n // 10), (n, 3)), + columns=['jim', 'joe', 'jolie']) + self.df4 = self.df3.copy() + self.df4['jim'] = self.df4['joe'] -class groupby_transform_multi_key3(object): - goal_time = 0.2 + def time_transform_lambda_max(self): + self.df.groupby(level='lev1').transform(lambda x: max(x)) - def setup(self): - np.random.seed(2718281) - self.n = 200000 - self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) + def time_transform_ufunc_max(self): + self.df.groupby(level='lev1').transform(np.max) - def time_groupby_transform_multi_key3(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + def time_transform_multi_key1(self): + self.df1.groupby(['jim', 'joe'])['jolie'].transform('max') + def time_transform_multi_key2(self): + self.df2.groupby(['jim', 'joe'])['jolie'].transform('max') -class groupby_transform_multi_key4(object): - goal_time = 0.2 + def time_transform_multi_key3(self): + self.df3.groupby(['jim', 'joe'])['jolie'].transform('max') - def setup(self): - np.random.seed(2718281) - self.n = 200000 - self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) - self.df['jim'] = self.df['joe'] + def time_transform_multi_key4(self): + self.df4.groupby(['jim', 'joe'])['jolie'].transform('max') +class TransformBools(object): -class groupby_transform_series(object): goal_time = 0.2 def setup(self): - np.random.seed(0) - self.N = 120000 - self.N_TRANSITIONS = 1400 - self.transition_points = np.random.permutation(np.arange(self.N))[:self.N_TRANSITIONS] - self.transition_points.sort() - self.transitions = np.zeros((self.N,), dtype=np.bool) - self.transitions[self.transition_points] = True - self.g = self.transitions.cumsum() - self.df = DataFrame({'signal': np.random.rand(self.N), }) - - def time_groupby_transform_series(self): + N = 120000 + transition_points = np.sort(np.random.choice(np.arange(N), 1400)) + transitions = np.zeros(N, dtype=np.bool) + transitions[transition_points] = True + self.g = transitions.cumsum() + self.df = DataFrame({'signal': np.random.rand(N)}) + + def time_transform_mean(self): self.df['signal'].groupby(self.g).transform(np.mean) -class groupby_transform_series2(object): +class TransformNaN(object): + # GH 12737 goal_time = 0.2 def setup(self): - np.random.seed(0) - self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), }) + 
self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df_nans.loc[4::10, 'B':'C'] = 5 - def time_groupby_transform_series2(self): - self.df.groupby('id')['val'].transform(np.mean) + def time_first(self): + self.df_nans.groupby('key').transform('first') diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py deleted file mode 100644 index 7638cc2a0f8df..0000000000000 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ /dev/null @@ -1,351 +0,0 @@ -from .pandas_vb_common import * -import os - - -class query_store_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = date_range('1/1/2000', periods=25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df12', self.df) - - def time_query_store_table(self): - self.store.select('df12', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])]) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class query_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = date_range('1/1/2000', periods=25000) - self.df = DataFrame(np.random.randn(25000, 100), index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df11', self.df) - - def time_query_store_table_wide(self): - self.store.select('df11', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])]) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.put('df1', self.df) - - def time_read_store(self): - self.store.get('df1') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.put('df3', self.df) - - def time_read_store_mixed(self): - self.store.get('df3') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df7', self.df) - - def time_read_store_table(self): - self.store.select('df7') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.N = 10000 - self.index = tm.makeStringIndex(self.N) - self.df = DataFrame({'float1': 
randn(self.N), 'float2': randn(self.N), 'string1': (['foo'] * self.N), 'bool1': ([True] * self.N), 'int1': np.random.randint(0, self.N, size=self.N), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df5', self.df) - - def time_read_store_table_mixed(self): - self.store.select('df5') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_panel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('p1', self.p) - - def time_read_store_table_panel(self): - self.store.select('p1') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(25000, 100)) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df9', self.df) - - def time_read_store_table_wide(self): - self.store.select('df9') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store(self): - self.store.put('df2', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_mixed(self): - self.store.put('df4', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table(self): - self.store.append('df8', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_dc(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(10000, 10), columns=[('C%03d' % i) for i in range(10)]) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table_dc(self): - self.store.append('df15', self.df, data_columns=True) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = 
tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 25000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table_mixed(self): - self.store.append('df6', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_panel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table_panel(self): - self.store.append('p2', self.p) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(25000, 100)) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table_wide(self): - self.store.append('df10', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass \ No newline at end of file diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 8c65f09937df4..f1703e163917a 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,292 +1,194 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex, + Float64Index) +from .pandas_vb_common import setup # noqa -class datetime_index_intersection(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=10000, freq='T') - self.rng2 = self.rng[:(-1)] - - def time_datetime_index_intersection(self): - self.rng.intersection(self.rng2) - - -class datetime_index_repr(object): - goal_time = 0.2 - - def setup(self): - self.dr = pd.date_range('20000101', freq='D', periods=100000) - - def time_datetime_index_repr(self): - self.dr._is_dates_only - - -class datetime_index_union(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=10000, freq='T') - self.rng2 = self.rng[:(-1)] - - def time_datetime_index_union(self): - self.rng.union(self.rng2) +class SetOperations(object): -class index_datetime_intersection(object): goal_time = 0.2 + params = (['datetime', 'date_string', 'int', 'strings'], + ['intersection', 'union', 'symmetric_difference']) + param_names = ['dtype', 'method'] - def setup(self): - self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) - if (self.rng.dtype == object): - self.rng = self.rng.view(Index) - else: - self.rng = self.rng.asobject - self.rng2 = self.rng[:(-1)] - - def time_index_datetime_intersection(self): - self.rng.intersection(self.rng2) - + def setup(self, dtype, method): + N = 10**5 + dates_left = date_range('1/1/2000', periods=N, freq='T') + fmt = '%Y-%m-%d %H:%M:%S' + date_str_left = Index(dates_left.strftime(fmt)) + int_left = Index(np.arange(N)) + str_left = tm.makeStringIndex(N) + data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]}, + 'date_string': {'left': date_str_left, + 'right': 
date_str_left[:-1]}, + 'int': {'left': int_left, 'right': int_left[:-1]}, + 'strings': {'left': str_left, 'right': str_left[:-1]}} + self.left = data[dtype]['left'] + self.right = data[dtype]['right'] -class index_datetime_union(object): - goal_time = 0.2 - - def setup(self): - self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) - if (self.rng.dtype == object): - self.rng = self.rng.view(Index) - else: - self.rng = self.rng.asobject - self.rng2 = self.rng[:(-1)] + def time_operation(self, dtype, method): + getattr(self.left, method)(self.right) - def time_index_datetime_union(self): - self.rng.union(self.rng2) +class SetDisjoint(object): -class index_float64_boolean_indexer(object): goal_time = 0.2 def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_boolean_indexer(self): - self.idx[self.mask] - + N = 10**5 + B = N + 20000 + self.datetime_left = DatetimeIndex(range(N)) + self.datetime_right = DatetimeIndex(range(N, B)) -class index_float64_boolean_series_indexer(object): - goal_time = 0.2 + def time_datetime_difference_disjoint(self): + self.datetime_left.difference(self.datetime_right) - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - def time_index_float64_boolean_series_indexer(self): - self.idx[self.series_mask] +class Datetime(object): - -class index_float64_construct(object): goal_time = 0.2 def setup(self): - self.baseidx = np.arange(1000000.0) - - def time_index_float64_construct(self): - Index(self.baseidx) + self.dr = date_range('20000101', freq='D', periods=10000) + def time_is_dates_only(self): + self.dr._is_dates_only -class index_float64_div(object): - goal_time = 0.2 - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) +class Ops(object): - def time_index_float64_div(self): - (self.idx / 2) + sample_time = 0.2 + params = ['float', 'int'] + param_names = ['dtype'] + def setup(self, dtype): + N = 10**6 + indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'} + self.index = getattr(tm, indexes[dtype])(N) -class index_float64_get(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + def time_add(self, dtype): + self.index + 2 - def time_index_float64_get(self): - self.idx[1] + def time_subtract(self, dtype): + self.index - 2 + def time_multiply(self, dtype): + self.index * 2 -class index_float64_mul(object): - goal_time = 0.2 + def time_divide(self, dtype): + self.index / 2 - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + def time_modulo(self, dtype): + self.index % 2 - def time_index_float64_mul(self): - (self.idx * 2) +class Range(object): -class index_float64_slice_indexer_basic(object): goal_time = 0.2 def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) + self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) - def time_index_float64_slice_indexer_basic(self): - self.idx[:(-1)] + def time_max(self): + self.idx_inc.max() + def 
time_max_trivial(self): + self.idx_dec.max() -class index_float64_slice_indexer_even(object): - goal_time = 0.2 + def time_min(self): + self.idx_dec.min() - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + def time_min_trivial(self): + self.idx_inc.min() - def time_index_float64_slice_indexer_even(self): - self.idx[::2] +class IndexAppend(object): -class index_int64_intersection(object): goal_time = 0.2 def setup(self): - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - def time_index_int64_intersection(self): - self.left.intersection(self.right) + N = 10000 + self.range_idx = RangeIndex(0, 100) + self.int_idx = self.range_idx.astype(int) + self.obj_idx = self.int_idx.astype(str) + self.range_idxs = [] + self.int_idxs = [] + self.object_idxs = [] + for i in range(1, N): + r_idx = RangeIndex(i * 100, (i + 1) * 100) + self.range_idxs.append(r_idx) + i_idx = r_idx.astype(int) + self.int_idxs.append(i_idx) + o_idx = i_idx.astype(str) + self.object_idxs.append(o_idx) + def time_append_range_list(self): + self.range_idx.append(self.range_idxs) -class index_int64_union(object): - goal_time = 0.2 + def time_append_int_list(self): + self.int_idx.append(self.int_idxs) - def setup(self): - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) + def time_append_obj_list(self): + self.obj_idx.append(self.object_idxs) - def time_index_int64_union(self): - self.left.union(self.right) +class Indexing(object): -class index_str_boolean_indexer(object): goal_time = 0.2 + params = ['String', 'Float', 'Int'] + param_names = ['dtype'] - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_boolean_indexer(self): - self.idx[self.mask] - + def setup(self, dtype): + N = 10**6 + self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) + self.array_mask = (np.arange(N) % 3) == 0 + self.series_mask = Series(self.array_mask) + self.sorted = self.idx.sort_values() + half = N // 2 + self.non_unique = self.idx[:half].append(self.idx[:half]) + self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half]) + self.key = self.sorted[N // 4] -class index_str_boolean_series_indexer(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) + def time_boolean_array(self, dtype): + self.idx[self.array_mask] - def time_index_str_boolean_series_indexer(self): + def time_boolean_series(self, dtype): self.idx[self.series_mask] + def time_get(self, dtype): + self.idx[1] -class index_str_slice_indexer_basic(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_slice_indexer_basic(self): - self.idx[:(-1)] - - -class index_str_slice_indexer_even(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) + def 
time_slice(self, dtype): + self.idx[:-1] - def time_index_str_slice_indexer_even(self): + def time_slice_step(self, dtype): self.idx[::2] + def time_get_loc(self, dtype): + self.idx.get_loc(self.key) -class multiindex_duplicated(object): - goal_time = 0.2 - - def setup(self): - (n, k) = (200, 5000) - self.levels = [np.arange(n), tm.makeStringIndex(n).values, (1000 + np.arange(n))] - self.labels = [np.random.choice(n, (k * n)) for lev in self.levels] - self.mi = MultiIndex(levels=self.levels, labels=self.labels) - - def time_multiindex_duplicated(self): - self.mi.duplicated() - - -class multiindex_from_product(object): - goal_time = 0.2 - - def setup(self): - self.iterables = [tm.makeStringIndex(10000), range(20)] - - def time_multiindex_from_product(self): - MultiIndex.from_product(self.iterables) + def time_get_loc_sorted(self, dtype): + self.sorted.get_loc(self.key) + def time_get_loc_non_unique(self, dtype): + self.non_unique.get_loc(self.key) -class multiindex_sortlevel_int64(object): - goal_time = 0.2 - - def setup(self): - self.n = ((((3 * 5) * 7) * 11) * (1 << 10)) - (low, high) = (((-1) << 12), (1 << 12)) - self.f = (lambda k: np.repeat(np.random.randint(low, high, (self.n // k)), k)) - self.i = np.random.permutation(self.n) - self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i] - - def time_multiindex_sortlevel_int64(self): - self.mi.sortlevel() - - -class multiindex_with_datetime_level_full(object): - goal_time = 0.2 - - def setup(self): - self.level1 = range(1000) - self.level2 = date_range(start='1/1/2012', periods=100) - self.mi = MultiIndex.from_product([self.level1, self.level2]) - - def time_multiindex_with_datetime_level_full(self): - self.mi.copy().values + def time_get_loc_non_unique_sorted(self, dtype): + self.non_unique_sorted.get_loc(self.key) -class multiindex_with_datetime_level_sliced(object): +class Float64IndexMethod(object): + # GH 13166 goal_time = 0.2 def setup(self): - self.level1 = range(1000) - self.level2 = date_range(start='1/1/2012', periods=100) - self.mi = MultiIndex.from_product([self.level1, self.level2]) + N = 100000 + a = np.arange(N) + self.ind = Float64Index(a * 4.8000000418824129e-08) - def time_multiindex_with_datetime_level_sliced(self): - self.mi[:10].values \ No newline at end of file + def time_get_loc(self): + self.ind.get_loc(0) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 32d80a7913234..739ad6a3d278b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,489 +1,354 @@ -from .pandas_vb_common import * -try: - import pandas.computation.expressions as expr -except: - expr = None +import warnings +import numpy as np +import pandas.util.testing as tm +from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, + IntervalIndex, CategoricalIndex, + IndexSlice, concat, date_range) +from .pandas_vb_common import setup, Panel # noqa -class dataframe_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - def time_dataframe_getitem_scalar(self): - self.df[self.col][self.idx] - - -class datamatrix_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - try: - self.klass = DataMatrix - except: - self.klass = DataFrame - self.index = tm.makeStringIndex(1000) - 
self.columns = tm.makeStringIndex(30) - self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - def time_datamatrix_getitem_scalar(self): - self.df[self.col][self.idx] - - -class series_get_value(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000) - self.s = Series(np.random.rand(1000), index=self.index) - self.idx = self.index[100] - - def time_series_get_value(self): - self.s.get_value(self.idx) - - -class time_series_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - tm.N = 1000 - self.ts = tm.makeTimeSeries() - self.dt = self.ts.index[500] - - def time_time_series_getitem_scalar(self): - self.ts[self.dt] - - -class frame_iloc_big(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(dict(A=(['foo'] * 1000000))) - - def time_frame_iloc_big(self): - self.df.iloc[:100, 0] - - -class frame_iloc_dups(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), }) - self.idx = (np.array(range(30)) * 99) - self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), }) - self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)]) - - def time_frame_iloc_dups(self): - self.df2.iloc[self.idx] +class NumericSeriesIndexing(object): -class frame_loc_dups(object): goal_time = 0.2 + params = [Int64Index, Float64Index] + param_names = ['index'] - def setup(self): - self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), }) - self.idx = (np.array(range(30)) * 99) - self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), }) - self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)]) + def setup(self, index): + N = 10**6 + idx = index(range(N)) + self.data = Series(np.random.rand(N), index=idx) + self.array = np.arange(10000) + self.array_list = self.array.tolist() - def time_frame_loc_dups(self): - self.df2.loc[self.idx] + def time_getitem_scalar(self, index): + self.data[800000] + def time_getitem_slice(self, index): + self.data[:800000] -class frame_xs_mi_ix(object): - goal_time = 0.2 + def time_getitem_list_like(self, index): + self.data[[800000]] - def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) - self.df = DataFrame(self.s) + def time_getitem_array(self, index): + self.data[self.array] - def time_frame_xs_mi_ix(self): - self.df.ix[999] + def time_getitem_lists(self, index): + self.data[self.array_list] + def time_iloc_array(self, index): + self.data.iloc[self.array] -class indexing_dataframe_boolean(object): - goal_time = 0.2 + def time_iloc_list_like(self, index): + self.data.iloc[[800000]] - def setup(self): - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) + def time_iloc_scalar(self, index): + self.data.iloc[800000] - def time_indexing_dataframe_boolean(self): - (self.df > self.df2) + def time_iloc_slice(self, index): + self.data.iloc[:800000] + def time_ix_array(self, index): + self.data.ix[self.array] -class indexing_dataframe_boolean_no_ne(object): - goal_time = 0.2 + def time_ix_list_like(self, index): + self.data.ix[[800000]] - def setup(self): - if (expr is None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_use_numexpr(False) + def time_ix_scalar(self, index): + self.data.ix[800000] - def 
time_indexing_dataframe_boolean_no_ne(self): - (self.df > self.df2) + def time_ix_slice(self, index): + self.data.ix[:800000] - def teardown(self): - expr.set_use_numexpr(True) + def time_loc_array(self, index): + self.data.loc[self.array] + def time_loc_list_like(self, index): + self.data.loc[[800000]] -class indexing_dataframe_boolean_rows(object): - goal_time = 0.2 + def time_loc_scalar(self, index): + self.data.loc[800000] - def setup(self): - self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df['B'] > 0) - self.obj_indexer = self.indexer.astype('O') + def time_loc_slice(self, index): + self.data.loc[:800000] - def time_indexing_dataframe_boolean_rows(self): - self.df[self.indexer] +class NonNumericSeriesIndexing(object): -class indexing_dataframe_boolean_rows_object(object): goal_time = 0.2 + params = ['string', 'datetime'] + param_names = ['index'] - def setup(self): - self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df['B'] > 0) - self.obj_indexer = self.indexer.astype('O') - - def time_indexing_dataframe_boolean_rows_object(self): - self.df[self.obj_indexer] + def setup(self, index): + N = 10**5 + indexes = {'string': tm.makeStringIndex(N), + 'datetime': date_range('1900', periods=N, freq='s')} + index = indexes[index] + self.s = Series(np.random.rand(N), index=index) + self.lbl = index[80000] + def time_getitem_label_slice(self, index): + self.s[:self.lbl] -class indexing_dataframe_boolean_st(object): - goal_time = 0.2 + def time_getitem_pos_slice(self, index): + self.s[:80000] - def setup(self): - if (expr is None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_numexpr_threads(1) + def time_get_value(self, index): + with warnings.catch_warnings(record=True): + self.s.get_value(self.lbl) - def time_indexing_dataframe_boolean_st(self): - (self.df > self.df2) + def time_getitem_scalar(self, index): + self.s[self.lbl] - def teardown(self): - expr.set_numexpr_threads() +class DataFrameStringIndexing(object): -class indexing_frame_get_value(object): goal_time = 0.2 def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] + index = tm.makeStringIndex(1000) + columns = tm.makeStringIndex(30) + self.df = DataFrame(np.random.randn(1000, 30), index=index, + columns=columns) + self.idx_scalar = index[100] + self.col_scalar = columns[10] + self.bool_indexer = self.df[self.col_scalar] > 0 + self.bool_obj_indexer = self.bool_indexer.astype(object) - def time_indexing_frame_get_value(self): - self.df.get_value(self.idx, self.col) + def time_get_value(self): + with warnings.catch_warnings(record=True): + self.df.get_value(self.idx_scalar, self.col_scalar) + def time_ix(self): + self.df.ix[self.idx_scalar, self.col_scalar] -class indexing_frame_get_value_ix(object): - goal_time = 0.2 + def time_loc(self): + self.df.loc[self.idx_scalar, self.col_scalar] - def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] + def time_getitem_scalar(self): + self.df[self.col_scalar][self.idx_scalar] - def time_indexing_frame_get_value_ix(self): - 
self.df.ix[(self.idx, self.col)] + def time_boolean_rows(self): + self.df[self.bool_indexer] + def time_boolean_rows_object(self): + self.df[self.bool_obj_indexer] -class indexing_panel_subset(object): - goal_time = 0.2 - def setup(self): - self.p = Panel(np.random.randn(100, 100, 100)) - self.inds = range(0, 100, 10) +class DataFrameNumericIndexing(object): - def time_indexing_panel_subset(self): - self.p.ix[(self.inds, self.inds, self.inds)] - - -class multiindex_slicers(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) - self.idx = pd.IndexSlice - self.n = 100000 - self.mdt = pandas.DataFrame() - self.mdt['A'] = np.random.choice(range(10000, 45000, 1000), self.n) - self.mdt['B'] = np.random.choice(range(10, 400), self.n) - self.mdt['C'] = np.random.choice(range(1, 150), self.n) - self.mdt['D'] = np.random.choice(range(10000, 45000), self.n) - self.mdt['x'] = np.random.choice(range(400), self.n) - self.mdt['y'] = np.random.choice(range(25), self.n) - self.test_A = 25000 - self.test_B = 25 - self.test_C = 40 - self.test_D = 35000 - self.eps_A = 5000 - self.eps_B = 5 - self.eps_C = 5 - self.eps_D = 5000 - self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() - - def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] - - -class series_getitem_array(object): - goal_time = 0.2 + self.idx_dupe = np.array(range(30)) * 99 + self.df = DataFrame(np.random.randn(10000, 5)) + self.df_dup = concat([self.df, 2 * self.df, 3 * self.df]) + self.bool_indexer = [True] * 5000 + [False] * 5000 - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_array(self): - self.s[np.arange(10000)] + def time_iloc_dups(self): + self.df_dup.iloc[self.idx_dupe] + def time_loc_dups(self): + self.df_dup.loc[self.idx_dupe] -class series_getitem_label_slice(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - self.lbl = self.s.index[800000] - - def time_series_getitem_label_slice(self): - self.s[:self.lbl] - + def time_iloc(self): + self.df.iloc[:100, 0] -class series_getitem_list_like(object): - goal_time = 0.2 + def time_loc(self): + self.df.loc[:100, 0] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_bool_indexer(self): + self.df[self.bool_indexer] - def time_series_getitem_list_like(self): - self.s[[800000]] +class Take(object): -class series_getitem_pos_slice(object): goal_time = 0.2 + params = ['int', 'datetime'] + param_names = ['index'] - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - - def time_series_getitem_pos_slice(self): - self.s[:800000] - - -class series_getitem_scalar(object): - goal_time = 0.2 + def setup(self, index): + N = 100000 + indexes = {'int': Int64Index(np.arange(N)), + 'datetime': date_range('2011-01-01', freq='S', periods=N)} + index = indexes[index] + self.s = Series(np.random.rand(N), index=index) + self.indexer = [True, False, True, True, False] * 20000 - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_take(self, index): + self.s.take(self.indexer) - def time_series_getitem_scalar(self): - self.s[800000] +class MultiIndexing(object): -class series_getitem_slice(object): goal_time = 0.2 def 
setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_slice(self): - self.s[:800000] + mi = MultiIndex.from_product([range(1000), range(1000)]) + self.s = Series(np.random.randn(1000000), index=mi) + self.df = DataFrame(self.s) + n = 100000 + self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), + n), + 'B': np.random.choice(range(10, 400), n), + 'C': np.random.choice(range(1, 150), n), + 'D': np.random.choice(range(10000, 45000), n), + 'x': np.random.choice(range(400), n), + 'y': np.random.choice(range(25), n)}) + self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] + self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index() + + def time_series_ix(self): + self.s.ix[999] -class series_iloc_array(object): - goal_time = 0.2 + def time_frame_ix(self): + self.df.ix[999] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_index_slice(self): + self.mdt.loc[self.idx, :] - def time_series_iloc_array(self): - self.s.iloc[np.arange(10000)] +class IntervalIndexing(object): -class series_iloc_list_like(object): goal_time = 0.2 - def setup(self): - self.s = Series(np.random.rand(1000000)) + def setup_cache(self): + idx = IntervalIndex.from_breaks(np.arange(1000001)) + monotonic = Series(np.arange(1000000), index=idx) + return monotonic - def time_series_iloc_list_like(self): - self.s.iloc[[800000]] + def time_getitem_scalar(self, monotonic): + monotonic[80000] + def time_loc_scalar(self, monotonic): + monotonic.loc[80000] -class series_iloc_scalar(object): - goal_time = 0.2 + def time_getitem_list(self, monotonic): + monotonic[80000:] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_loc_list(self, monotonic): + monotonic.loc[80000:] - def time_series_iloc_scalar(self): - self.s.iloc[800000] +class CategoricalIndexIndexing(object): -class series_iloc_slice(object): goal_time = 0.2 + params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + param_names = ['index'] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def setup(self, index): + N = 10**5 + values = list('a' * N + 'b' * N + 'c' * N) + indices = { + 'monotonic_incr': CategoricalIndex(values), + 'monotonic_decr': CategoricalIndex(reversed(values)), + 'non_monotonic': CategoricalIndex(list('abc' * N))} + self.data = indices[index] - def time_series_iloc_slice(self): - self.s.iloc[:800000] + self.int_scalar = 10000 + self.int_list = list(range(10000)) + self.cat_scalar = 'b' + self.cat_list = ['a', 'c'] -class series_ix_array(object): - goal_time = 0.2 + def time_getitem_scalar(self, index): + self.data[self.int_scalar] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_getitem_slice(self, index): + self.data[:self.int_scalar] - def time_series_ix_array(self): - self.s.ix[np.arange(10000)] + def time_getitem_list_like(self, index): + self.data[[self.int_scalar]] + def time_getitem_list(self, index): + self.data[self.int_list] -class series_ix_list_like(object): - goal_time = 0.2 + def time_getitem_bool_array(self, index): + self.data[self.data == self.cat_scalar] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_get_loc_scalar(self, index): + self.data.get_loc(self.cat_scalar) - def time_series_ix_list_like(self): - self.s.ix[[800000]] + def time_get_indexer_list(self, index): + self.data.get_indexer(self.cat_list) -class series_ix_scalar(object): - goal_time = 0.2 +class PanelIndexing(object): - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def 
time_series_ix_scalar(self): - self.s.ix[800000] - - -class series_ix_slice(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_ix_slice(self): - self.s.ix[:800000] + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(100, 100, 100)) + self.inds = range(0, 100, 10) + def time_subset(self): + with warnings.catch_warnings(record=True): + self.p.ix[(self.inds, self.inds, self.inds)] -class series_loc_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_loc_array(self): - self.s.loc[np.arange(10000)] +class MethodLookup(object): -class series_loc_list_like(object): goal_time = 0.2 - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_loc_list_like(self): - self.s.loc[[800000]] + def setup_cache(self): + s = Series() + return s + def time_lookup_iloc(self, s): + s.iloc -class series_loc_scalar(object): - goal_time = 0.2 + def time_lookup_ix(self, s): + s.ix - def setup(self): - self.s = Series(np.random.rand(1000000)) + def time_lookup_loc(self, s): + s.loc - def time_series_loc_scalar(self): - self.s.loc[800000] +class GetItemSingleColumn(object): -class series_loc_slice(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.rand(1000000)) + self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df_int_col = DataFrame(np.random.randn(3000, 1)) - def time_series_loc_slice(self): - self.s.loc[:800000] - - -class series_take_dtindex(object): - goal_time = 0.2 + def time_frame_getitem_single_column_label(self): + self.df_string_col['A'] - def setup(self): - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) + def time_frame_getitem_single_column_int(self): + self.df_int_col[0] - def time_series_take_dtindex(self): - self.ts.take(self.indexer) +class AssignTimeseriesIndex(object): -class series_take_intindex(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) - - def time_series_take_intindex(self): - self.s.take(self.indexer) - + N = 100000 + idx = date_range('1/1/2000', periods=N, freq='H') + self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) -class series_xs_mi_ix(object): - goal_time = 0.2 - - def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) + def time_frame_assign_timeseries_index(self): + self.df['date'] = self.df.index - def time_series_xs_mi_ix(self): - self.s.ix[999] +class InsertColumns(object): -class sort_level_one(object): goal_time = 0.2 def setup(self): - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx = MultiIndex.from_arrays([self.a, self.b]) - self.midx = self.midx.take(np.random.permutation(np.arange(100000))) + self.N = 10**3 + self.df = DataFrame(index=range(self.N)) - def time_sort_level_one(self): - self.midx.sortlevel(1) - - -class sort_level_zero(object): - goal_time = 0.2 - - def setup(self): - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx = MultiIndex.from_arrays([self.a, self.b]) - self.midx = 
self.midx.take(np.random.permutation(np.arange(100000))) + def time_insert(self): + np.random.seed(1234) + for i in range(100): + self.df.insert(0, i, np.random.randn(self.N), + allow_duplicates=True) - def time_sort_level_zero(self): - self.midx.sortlevel(0) \ No newline at end of file + def time_assign_with_setitem(self): + np.random.seed(1234) + for i in range(100): + self.df[i] = np.random.randn(self.N) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 3fceed087facb..16d9e7cd73cbb 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,138 +1,113 @@ -from .pandas_vb_common import * -import pandas as pd +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, to_numeric +from .pandas_vb_common import numeric_dtypes, lib, setup # noqa -class dtype_infer_datetime64(object): + +class NumericInferOps(object): + # from GH 7332 goal_time = 0.2 + params = numeric_dtypes + param_names = ['dtype'] - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) + def setup(self, dtype): + N = 5 * 10**5 + self.df = DataFrame({'A': np.arange(N).astype(dtype), + 'B': np.arange(N).astype(dtype)}) - def time_dtype_infer_datetime64(self): - (self.df_datetime64['A'] - self.df_datetime64['B']) + def time_add(self, dtype): + self.df['A'] + self.df['B'] + def time_subtract(self, dtype): + self.df['A'] - self.df['B'] -class dtype_infer_float32(object): - goal_time = 0.2 + def time_multiply(self, dtype): + self.df['A'] * self.df['B'] - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) + def time_divide(self, dtype): + self.df['A'] / self.df['B'] - def time_dtype_infer_float32(self): - (self.df_float32['A'] + self.df_float32['B']) + def time_modulo(self, dtype): + self.df['A'] % self.df['B'] -class dtype_infer_float64(object): +class DateInferOps(object): + # from GH 7332 goal_time = 0.2 - def setup(self): - 
self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) + def setup_cache(self): + N = 5 * 10**5 + df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) + df['timedelta'] = df['datetime64'] - df['datetime64'] + return df - def time_dtype_infer_float64(self): - (self.df_float64['A'] + self.df_float64['B']) + def time_subtract_datetimes(self, df): + df['datetime64'] - df['datetime64'] + def time_timedelta_plus_datetime(self, df): + df['timedelta'] + df['datetime64'] -class dtype_infer_int32(object): - goal_time = 0.2 + def time_add_timedeltas(self, df): + df['timedelta'] + df['timedelta'] - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - def time_dtype_infer_int32(self): - (self.df_int32['A'] + self.df_int32['B']) +class ToNumeric(object): - -class dtype_infer_int64(object): goal_time = 0.2 + params = ['ignore', 'coerce'] + param_names = ['errors'] - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) + def setup(self, errors): + N = 10000 + self.float = Series(np.random.randn(N)) + self.numstr = self.float.astype('str') + self.str = Series(tm.makeStringIndex(N)) - def 
time_dtype_infer_int64(self): - (self.df_int64['A'] + self.df_int64['B']) + def time_from_float(self, errors): + to_numeric(self.float, errors=errors) + def time_from_numeric_str(self, errors): + to_numeric(self.numstr, errors=errors) -class dtype_infer_timedelta64_1(object): - goal_time = 0.2 + def time_from_str(self, errors): + to_numeric(self.str, errors=errors) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - def time_dtype_infer_timedelta64_1(self): - (self.df_timedelta64['A'] + self.df_timedelta64['B']) +class ToNumericDowncast(object): + param_names = ['dtype', 'downcast'] + params = [['string-float', 'string-int', 'string-nint', 'datetime64', + 'int-list', 'int32'], + [None, 'integer', 'signed', 'unsigned', 'float']] -class dtype_infer_timedelta64_2(object): - goal_time = 0.2 + N = 500000 + N2 = int(N / 2) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) + data_dict = {'string-int': ['1'] * N2 + [2] * N2, + 'string-nint': ['-1'] * N2 + [2] * N2, + 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], + dtype='datetime64[D]'), N), + 'string-float': ['1.1'] * N2 + [2] * N2, + 'int-list': [1] * N2 + [2] * N2, + 'int32': np.repeat(np.int32(1), N)} - def time_dtype_infer_timedelta64_2(self): - (self.df_timedelta64['A'] + self.df_timedelta64['A']) + def setup(self, dtype, downcast): + self.data = self.data_dict[dtype] + def time_downcast(self, dtype, downcast): + to_numeric(self.data, downcast=downcast) -class dtype_infer_uint32(object): - goal_time = 0.2 - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, 
dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) \ No newline at end of file +class MaybeConvertNumeric(object): + + def setup_cache(self): + N = 10**6 + arr = np.repeat([2**63], N) + np.arange(N).astype('uint64') + data = arr.astype(object) + data[1::2] = arr[1::2].astype(str) + data[-1] = -1 + return data + + def time_convert(self, data): + lib.maybe_convert_numeric(data, set(), coerce_numeric=False) diff --git a/doc/sphinxext/ipython_sphinxext/__init__.py b/asv_bench/benchmarks/io/__init__.py similarity index 100% rename from doc/sphinxext/ipython_sphinxext/__init__.py rename to asv_bench/benchmarks/io/__init__.py diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py new file mode 100644 index 0000000000000..0f5d07f9fac55 --- /dev/null +++ b/asv_bench/benchmarks/io/csv.py @@ -0,0 +1,217 @@ +import random +import timeit +import string + +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Categorical, date_range, read_csv +from pandas.compat import PY2 +from pandas.compat import cStringIO as StringIO + +from ..pandas_vb_common import setup, BaseIO # noqa + + +class ToCSV(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + params = ['wide', 'long', 'mixed'] + param_names = ['kind'] + + def setup(self, kind): + wide_frame = DataFrame(np.random.randn(3000, 30)) + long_frame = DataFrame({'A': np.arange(50000), + 'B': np.arange(50000) + 1., + 'C': np.arange(50000) + 2., + 'D': np.arange(50000) + 3.}) + mixed_frame = DataFrame({'float': np.random.randn(5000), + 'int': np.random.randn(5000).astype(int), + 'bool': (np.arange(5000) % 2) == 0, + 'datetime': date_range('2001', + freq='s', + periods=5000), + 'object': ['foo'] * 5000}) + mixed_frame.loc[30:500, 'float'] = np.nan + data = {'wide': wide_frame, + 'long': long_frame, + 'mixed': mixed_frame} + self.df = data[kind] + + def time_frame(self, kind): + self.df.to_csv(self.fname) + + +class ToCSVDatetime(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + + def setup(self): + rng = date_range('1/1/2000', periods=1000) + self.data = DataFrame(rng, index=rng) + + def time_frame_date_formatting(self): + self.data.to_csv(self.fname, date_format='%Y%m%d') + + +class ReadCSVDInferDatetimeFormat(object): + + goal_time = 0.2 + params = ([True, False], ['custom', 'iso8601', 'ymd']) + param_names = ['infer_datetime_format', 'format'] + + def setup(self, infer_datetime_format, format): + rng = date_range('1/1/2000', periods=1000) + formats = {'custom': '%m/%d/%Y %H:%M:%S.%f', + 'iso8601': '%Y-%m-%d %H:%M:%S', + 'ymd': '%Y%m%d'} + dt_format = formats[format] + self.data = StringIO('\n'.join(rng.strftime(dt_format).tolist())) + + def time_read_csv(self, infer_datetime_format, format): + read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'], + infer_datetime_format=infer_datetime_format) + + +class ReadCSVSkipRows(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + params = [None, 10000] + param_names = ['skiprows'] + + def setup(self, skiprows): + N = 20000 + index = tm.makeStringIndex(N) + df = 
DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N), + 'string1': ['foo'] * N, + 'bool1': [True] * N, + 'int1': np.random.randint(0, N, size=N)}, + index=index) + df.to_csv(self.fname) + + def time_skiprows(self, skiprows): + read_csv(self.fname, skiprows=skiprows) + + +class ReadUint64Integers(object): + + goal_time = 0.2 + + def setup(self): + self.na_values = [2**63 + 500] + arr = np.arange(10000).astype('uint64') + 2**63 + self.data1 = StringIO('\n'.join(arr.astype(str).tolist())) + arr = arr.astype(object) + arr[500] = -1 + self.data2 = StringIO('\n'.join(arr.astype(str).tolist())) + + def time_read_uint64(self): + read_csv(self.data1, header=None, names=['foo']) + + def time_read_uint64_neg_values(self): + read_csv(self.data2, header=None, names=['foo']) + + def time_read_uint64_na_values(self): + read_csv(self.data1, header=None, names=['foo'], + na_values=self.na_values) + + +class ReadCSVThousands(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + params = ([',', '|'], [None, ',']) + param_names = ['sep', 'thousands'] + + def setup(self, sep, thousands): + N = 10000 + K = 8 + data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) + df = DataFrame(data) + if thousands is not None: + fmt = ':{}'.format(thousands) + fmt = '{' + fmt + '}' + df = df.applymap(lambda x: fmt.format(x)) + df.to_csv(self.fname, sep=sep) + + def time_thousands(self, sep, thousands): + read_csv(self.fname, sep=sep, thousands=thousands) + + +class ReadCSVComment(object): + + goal_time = 0.2 + + def setup(self): + data = ['A,B,C'] + (['1,2,3 # comment'] * 100000) + self.s_data = StringIO('\n'.join(data)) + + def time_comment(self): + read_csv(self.s_data, comment='#', header=None, names=list('abc')) + + +class ReadCSVFloatPrecision(object): + + goal_time = 0.2 + params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip']) + param_names = ['sep', 'decimal', 'float_precision'] + + def setup(self, sep, decimal, float_precision): + floats = [''.join(random.choice(string.digits) for _ in range(28)) + for _ in range(15)] + rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n' + data = rows * 5 + data = data.format(*floats) * 200 # 1000 x 3 strings csv + self.s_data = StringIO(data) + + def time_read_csv(self, sep, decimal, float_precision): + read_csv(self.s_data, sep=sep, header=None, names=list('abc'), + float_precision=float_precision) + + def time_read_csv_python_engine(self, sep, decimal, float_precision): + read_csv(self.s_data, sep=sep, header=None, engine='python', + float_precision=None, names=list('abc')) + + +class ReadCSVCategorical(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + + def setup(self): + N = 100000 + group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] + df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc')) + df.to_csv(self.fname, index=False) + + def time_convert_post(self): + read_csv(self.fname).apply(Categorical) + + def time_convert_direct(self): + read_csv(self.fname, dtype='category')
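
When several physical columns jointly encode one timestamp, `read_csv` in this era of pandas accepts a list of column lists for `parse_dates` and concatenates each group into a single column before parsing, which is what the `ReadCSVParseDates` benchmark below exercises. A small sketch of the behavior (illustrative only, not part of this patch):

    from pandas import read_csv
    from pandas.compat import cStringIO as StringIO

    # columns 0 and 1 hold the date and the time separately;
    # parse_dates=[[0, 1]] joins them into one parsed datetime column
    csv = StringIO('19990127,19:00:00,0.81\n19990127,20:00:00,0.01\n')
    df = read_csv(csv, header=None, parse_dates=[[0, 1]])

+ + +class ReadCSVParseDates(object): + + goal_time = 0.2 + + def setup(self): + data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n + {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n + {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n + {},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n + {},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n + """ + two_cols = ['KORD,19990127'] * 5 + data = data.format(*two_cols) + self.s_data = StringIO(data) + + def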
time_multiple_date(self): + read_csv(self.s_data, sep=',', header=None, + names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]]) + + def time_baseline(self): + read_csv(self.s_data, sep=',', header=None, parse_dates=[1], + names=list(string.digits[:9])) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py new file mode 100644 index 0000000000000..58ab6bb8046c5 --- /dev/null +++ b/asv_bench/benchmarks/io/excel.py @@ -0,0 +1,36 @@ +import numpy as np +from pandas import DataFrame, date_range, ExcelWriter, read_excel +from pandas.compat import BytesIO +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Excel(object): + + goal_time = 0.2 + params = ['openpyxl', 'xlsxwriter', 'xlwt'] + param_names = ['engine'] + + def setup(self, engine): + N = 2000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.bio_read = BytesIO() + self.writer_read = ExcelWriter(self.bio_read, engine=engine) + self.df.to_excel(self.writer_read, sheet_name='Sheet1') + self.writer_read.save() + self.bio_read.seek(0) + + def time_read_excel(self, engine): + read_excel(self.bio_read) + + def time_write_excel(self, engine): + bio_write = BytesIO() + bio_write.seek(0) + writer_write = ExcelWriter(bio_write, engine=engine) + self.df.to_excel(writer_write, sheet_name='Sheet1') + writer_write.save() diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py new file mode 100644 index 0000000000000..4b6e1d69af92d --- /dev/null +++ b/asv_bench/benchmarks/io/hdf.py @@ -0,0 +1,151 @@ +import warnings + +import numpy as np +from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class HDFStoreDataFrame(BaseIO): + + goal_time = 0.2 + + def setup(self): + N = 25000 + index = tm.makeStringIndex(N) + self.df = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=index) + self.df_mixed = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N), + 'string1': ['foo'] * N, + 'bool1': [True] * N, + 'int1': np.random.randint(0, N, size=N)}, + index=index) + self.df_wide = DataFrame(np.random.randn(N, 100)) + self.start_wide = self.df_wide.index[10000] + self.stop_wide = self.df_wide.index[15000] + self.df2 = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=date_range('1/1/2000', periods=N)) + self.start = self.df2.index[10000] + self.stop = self.df2.index[15000] + self.df_wide2 = DataFrame(np.random.randn(N, 100), + index=date_range('1/1/2000', periods=N)) + self.df_dc = DataFrame(np.random.randn(N, 10), + columns=['C%03d' % i for i in range(10)]) + + self.fname = '__test__.h5' + + self.store = HDFStore(self.fname) + self.store.put('fixed', self.df) + self.store.put('fixed_mixed', self.df_mixed) + self.store.append('table', self.df2) + self.store.append('table_mixed', self.df_mixed) + self.store.append('table_wide', self.df_wide) + self.store.append('table_wide2', self.df_wide2) + + def teardown(self): + self.store.close() + self.remove(self.fname) + + def time_read_store(self): + self.store.get('fixed') + + def time_read_store_mixed(self): + self.store.get('fixed_mixed') + + def time_write_store(self): + self.store.put('fixed_write', self.df) + + def time_write_store_mixed(self): + 
self.store.put('fixed_mixed_write', self.df_mixed) + + def time_read_store_table_mixed(self): + self.store.select('table_mixed') + + def time_write_store_table_mixed(self): + self.store.append('table_mixed_write', self.df_mixed) + + def time_read_store_table(self): + self.store.select('table') + + def time_write_store_table(self): + self.store.append('table_write', self.df) + + def time_read_store_table_wide(self): + self.store.select('table_wide') + + def time_write_store_table_wide(self): + self.store.append('table_wide_write', self.df_wide) + + def time_write_store_table_dc(self): + self.store.append('table_dc_write', self.df_dc, data_columns=True) + + def time_query_store_table_wide(self): + self.store.select('table_wide', where="index > self.start_wide and " + "index < self.stop_wide") + + def time_query_store_table(self): + self.store.select('table', where="index > self.start and " + "index < self.stop") + + def time_store_repr(self): + repr(self.store) + + def time_store_str(self): + str(self.store) + + def time_store_info(self): + self.store.info() + + +class HDFStorePanel(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.h5' + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(20, 1000, 25), + items=['Item%03d' % i for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=['E%03d' % i for i in range(25)]) + self.store = HDFStore(self.fname) + self.store.append('p1', self.p) + + def teardown(self): + self.store.close() + self.remove(self.fname) + + def time_read_store_table_panel(self): + with warnings.catch_warnings(record=True): + self.store.select('p1') + + def time_write_store_table_panel(self): + with warnings.catch_warnings(record=True): + self.store.append('p2', self.p) + + +class HDF(BaseIO): + + goal_time = 0.2 + params = ['table', 'fixed'] + param_names = ['format'] + + def setup(self, format): + self.fname = '__test__.h5' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df.to_hdf(self.fname, 'df', format=format) + + def time_read_hdf(self, format): + read_hdf(self.fname, 'df') + + def time_write_hdf(self, format): + self.df.to_hdf(self.fname, 'df', format=format) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py new file mode 100644 index 0000000000000..acfdd327c3b51 --- /dev/null +++ b/asv_bench/benchmarks/io/json.py @@ -0,0 +1,127 @@ +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, date_range, timedelta_range, concat, read_json + +from ..pandas_vb_common import setup, BaseIO # noqa + + +class ReadJSON(BaseIO): + + goal_time = 0.2 + fname = "__test__.json" + params = (['split', 'index', 'records'], ['int', 'datetime']) + param_names = ['orient', 'index'] + + def setup(self, orient, index): + N = 100000 + indexes = {'int': np.arange(N), + 'datetime': date_range('20000101', periods=N, freq='H')} + df = DataFrame(np.random.randn(N, 5), + columns=['float_{}'.format(i) for i in range(5)], + index=indexes[index]) + df.to_json(self.fname, orient=orient) + + def time_read_json(self, orient, index): + read_json(self.fname, orient=orient) + + +class ReadJSONLines(BaseIO): + + goal_time = 0.2 + fname = "__test_lines__.json" + params = ['int', 'datetime'] + param_names = ['index'] + + def setup(self, index): + N = 100000 + indexes = {'int': np.arange(N), + 'datetime': 
date_range('20000101', periods=N, freq='H')} + df = DataFrame(np.random.randn(N, 5), + columns=['float_{}'.format(i) for i in range(5)], + index=indexes[index]) + df.to_json(self.fname, orient='records', lines=True) + + def time_read_json_lines(self, index): + read_json(self.fname, orient='records', lines=True) + + def time_read_json_lines_concat(self, index): + concat(read_json(self.fname, orient='records', lines=True, + chunksize=25000)) + + def peakmem_read_json_lines(self, index): + read_json(self.fname, orient='records', lines=True) + + def peakmem_read_json_lines_concat(self, index): + concat(read_json(self.fname, orient='records', lines=True, + chunksize=25000)) + + +class ToJSON(BaseIO): + + goal_time = 0.2 + fname = "__test__.json" + params = ['split', 'columns', 'index'] + param_names = ['orient'] + + def setup(self, orient): + N = 10**5 + ncols = 5 + index = date_range('20000101', periods=N, freq='H') + timedeltas = timedelta_range(start=1, periods=N, freq='s') + datetimes = date_range(start=1, periods=N, freq='s') + ints = np.random.randint(100000000, size=N) + floats = np.random.randn(N) + strings = tm.makeStringIndex(N) + self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) + self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) + self.df_td_int_ts = DataFrame({'td_1': timedeltas, + 'td_2': timedeltas, + 'int_1': ints, + 'int_2': ints, + 'ts_1': datetimes, + 'ts_2': datetimes}, + index=index) + self.df_int_floats = DataFrame({'int_1': ints, + 'int_2': ints, + 'int_3': ints, + 'float_1': floats, + 'float_2': floats, + 'float_3': floats}, + index=index) + self.df_int_float_str = DataFrame({'int_1': ints, + 'int_2': ints, + 'float_1': floats, + 'float_2': floats, + 'str_1': strings, + 'str_2': strings}, + index=index) + + def time_floats_with_int_index(self, orient): + self.df.to_json(self.fname, orient=orient) + + def time_floats_with_dt_index(self, orient): + self.df_date_idx.to_json(self.fname, orient=orient) + + def time_delta_int_tstamp(self, orient): + self.df_td_int_ts.to_json(self.fname, orient=orient) + + def time_float_int(self, orient): + self.df_int_floats.to_json(self.fname, orient=orient) + + def time_float_int_str(self, orient): + self.df_int_float_str.to_json(self.fname, orient=orient) + + def time_floats_with_int_index_lines(self, orient): + self.df.to_json(self.fname, orient='records', lines=True) + + def time_floats_with_dt_index_lines(self, orient): + self.df_date_idx.to_json(self.fname, orient='records', lines=True) + + def time_delta_int_tstamp_lines(self, orient): + self.df_td_int_ts.to_json(self.fname, orient='records', lines=True) + + def time_float_int_lines(self, orient): + self.df_int_floats.to_json(self.fname, orient='records', lines=True) + + def time_float_int_str_lines(self, orient): + self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
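
For a sense of what these orients produce: `orient='records'` together with `lines=True` writes one JSON object per line (the JSON Lines layout these benchmarks time), and `read_json` can stream it back in chunks via `chunksize`. A minimal round trip (illustrative only, not part of this patch; the file name is made up):

    import numpy as np
    from pandas import DataFrame, read_json

    df = DataFrame({'a': np.arange(3), 'b': np.random.randn(3)})
    df.to_json('__example__.json', orient='records', lines=True)
    # each line of the file becomes one row again on the way back in
    roundtripped = read_json('__example__.json', orient='records', lines=True)

diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py new file mode 100644 index 0000000000000..8ccce01117ca4 --- /dev/null +++ b/asv_bench/benchmarks/io/msgpack.py @@ -0,0 +1,26 @@ +import numpy as np +from pandas import DataFrame, date_range, read_msgpack +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class MSGPack(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.msg' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] =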
tm.makeStringIndex(N) + self.df.to_msgpack(self.fname) + + def time_read_msgpack(self): + read_msgpack(self.fname) + + def time_write_msgpack(self): + self.df.to_msgpack(self.fname) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py new file mode 100644 index 0000000000000..2ad0fcca6eb26 --- /dev/null +++ b/asv_bench/benchmarks/io/pickle.py @@ -0,0 +1,26 @@ +import numpy as np +from pandas import DataFrame, date_range, read_pickle +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Pickle(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.pkl' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df.to_pickle(self.fname) + + def time_read_pickle(self): + read_pickle(self.fname) + + def time_write_pickle(self): + self.df.to_pickle(self.fname) diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py new file mode 100644 index 0000000000000..526c524de7fff --- /dev/null +++ b/asv_bench/benchmarks/io/sas.py @@ -0,0 +1,21 @@ +import os + +from pandas import read_sas + + +class SAS(object): + + goal_time = 0.2 + params = ['sas7bdat', 'xport'] + param_names = ['format'] + + def setup(self, format): + # Read files that are located in 'pandas/tests/io/sas/data' + files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'} + file = files[format] + paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas', + 'tests', 'io', 'sas', 'data', file] + self.f = os.path.join(*paths) + + def time_read_sas(self, format): + read_sas(self.f, format=format)
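
The SQL benchmarks that follow drive the same `to_sql`/`read_sql_query` surface through both a raw `sqlite3` connection and a SQLAlchemy engine. For orientation, a minimal round trip looks roughly like this (illustrative only, not part of this patch; the table name is made up):

    import sqlite3

    import numpy as np
    from pandas import DataFrame, read_sql_query

    con = sqlite3.connect(':memory:')
    df = DataFrame({'a': np.arange(5), 'b': np.random.randn(5)})
    # if_exists='replace' drops and recreates the table on each write
    df.to_sql('example', con, if_exists='replace')
    back = read_sql_query('SELECT * FROM example', con)

diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py new file mode 100644 index 0000000000000..ef4e501e5f3b9 --- /dev/null +++ b/asv_bench/benchmarks/io/sql.py @@ -0,0 +1,132 @@ +import sqlite3 + +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, date_range, read_sql_query, read_sql_table +from sqlalchemy import create_engine + +from ..pandas_vb_common import setup # noqa + + +class SQL(object): + + goal_time = 0.2 + params = ['sqlalchemy', 'sqlite'] + param_names = ['connection'] + + def setup(self, connection): + N = 10000 + con = {'sqlalchemy': create_engine('sqlite:///:memory:'), +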
'sqlite': sqlite3.connect(':memory:')} + self.table_name = 'test_type' + self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name) + self.con = con[connection] + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_to_sql_dataframe_column(self, connection, dtype): + self.df[[dtype]].to_sql('test1', self.con, if_exists='replace') + + def time_read_sql_query_select_column(self, connection, dtype): + read_sql_query(self.query_col, self.con) + + +class ReadSQLTable(object): + + goal_time = 0.2 + + def setup(self): + N = 10000 + self.table_name = 'test' + self.con = create_engine('sqlite:///:memory:') + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_read_sql_table_all(self): + read_sql_table(self.table_name, self.con) + + def time_read_sql_table_parse_dates(self): + read_sql_table(self.table_name, self.con, columns=['datetime_string'], + parse_dates=['datetime_string']) + + +class ReadSQLTableDtypes(object): + + goal_time = 0.2 + + params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10000 + self.table_name = 'test' + self.con = create_engine('sqlite:///:memory:') + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_read_sql_table_column(self, dtype): + read_sql_table(self.table_name, self.con, columns=[dtype]) diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py new file mode 100644 index 0000000000000..e0f5752ca930f --- /dev/null +++ b/asv_bench/benchmarks/io/stata.py @@ -0,0 +1,37 @@ +import numpy as np +from pandas import DataFrame, date_range, read_stata +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Stata(BaseIO): + + goal_time = 0.2 + params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'] + param_names = ['convert_dates'] + + def setup(self, convert_dates): + self.fname = '__test__.dta' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min, + np.iinfo(np.int8).max - 27, N) + self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max - 27, N) + 
self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min, + np.iinfo(np.int32).max - 27, N) + self.df['float32_'] = np.array(np.random.randn(N), + dtype=np.float32) + self.convert_dates = {'index': convert_dates} + self.df.to_stata(self.fname, self.convert_dates) + + def time_read_stata(self, convert_dates): + read_stata(self.fname) + + def time_write_stata(self, convert_dates): + self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py deleted file mode 100644 index 0f15ab6e5e142..0000000000000 --- a/asv_bench/benchmarks/io_bench.py +++ /dev/null @@ -1,171 +0,0 @@ -from .pandas_vb_common import * -from pandas import concat, Timestamp, compat -try: - from StringIO import StringIO -except ImportError: - from io import StringIO -import timeit - - -class frame_to_csv(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(3000, 30)) - - def time_frame_to_csv(self): - self.df.to_csv('__test__.csv') - - -class frame_to_csv2(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'A': range(50000), }) - self.df['B'] = (self.df.A + 1.0) - self.df['C'] = (self.df.A + 2.0) - self.df['D'] = (self.df.A + 3.0) - - def time_frame_to_csv2(self): - self.df.to_csv('__test__.csv') - - -class frame_to_csv_date_formatting(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = DataFrame(self.rng, index=self.rng) - - def time_frame_to_csv_date_formatting(self): - self.data.to_csv('__test__.csv', date_format='%Y%m%d') - - -class frame_to_csv_mixed(object): - goal_time = 0.2 - - def setup(self): - self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=self.create_cols('float')) - self.df_int = DataFrame(np.random.randn(5000, 5), dtype='int64', columns=self.create_cols('int')) - self.df_bool = DataFrame(True, index=self.df_float.index, columns=self.create_cols('bool')) - self.df_object = DataFrame('foo', index=self.df_float.index, columns=self.create_cols('object')) - self.df_dt = DataFrame(Timestamp('20010101'), index=self.df_float.index, columns=self.create_cols('date')) - self.df_float.ix[30:500, 1:3] = np.nan - self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1) - - def time_frame_to_csv_mixed(self): - self.df.to_csv('__test__.csv') - - def create_cols(self, name): - return [('%s%03d' % (name, i)) for i in range(5)] - - -class read_csv_infer_datetime_format_custom(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%m/%d/%Y %H:%M:%S.%f')))) - - def time_read_csv_infer_datetime_format_custom(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) - - -class read_csv_infer_datetime_format_iso8601(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S')))) - - def time_read_csv_infer_datetime_format_iso8601(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) - - -class read_csv_infer_datetime_format_ymd(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y%m%d')))) - - def time_read_csv_infer_datetime_format_ymd(self): - 
read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) - - -class read_csv_skiprows(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(20000) - self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index) - self.df.to_csv('__test__.csv') - - def time_read_csv_skiprows(self): - read_csv('__test__.csv', skiprows=10000) - - -class read_csv_standard(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_csv('__test__.csv') - - def time_read_csv_standard(self): - read_csv('__test__.csv') - - -class read_parse_dates_iso8601(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S')))) - - def time_read_parse_dates_iso8601(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo']) - - -class write_csv_standard(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - - def time_write_csv_standard(self): - self.df.to_csv('__test__.csv') - - -class read_csv_from_s3(object): - # Make sure that we can read part of a file from S3 without - # needing to download the entire thing. Use the timeit.default_timer - # to measure wall time instead of CPU time -- we want to see - # how long it takes to download the data. - timer = timeit.default_timer - params = ([None, "gzip", "bz2"], ["python", "c"]) - param_names = ["compression", "engine"] - - def setup(self, compression, engine): - if compression == "bz2" and engine == "c" and compat.PY2: - # The Python 2 C parser can't read bz2 from open files. - raise NotImplementedError - try: - import boto - except ImportError: - # Skip these benchmarks if `boto` is not installed. - raise NotImplementedError - - self.big_fname = "s3://pandas-test/large_random.csv" - - def time_read_nrows(self, compression, engine): - # Read a small number of rows from a huge (100,000 x 50) table. 
- ext = "" - if compression == "gzip": - ext = ".gz" - elif compression == "bz2": - ext = ".bz2" - pd.read_csv(self.big_fname + ext, nrows=10, - compression=compression, engine=engine) diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py deleted file mode 100644 index 9a6b21f9e067a..0000000000000 --- a/asv_bench/benchmarks/io_sql.py +++ /dev/null @@ -1,215 +0,0 @@ -import sqlalchemy -from .pandas_vb_common import * -import sqlite3 -from sqlalchemy import create_engine - - -class sql_datetime_read_and_parse_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_datetime_read_and_parse_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) - - -class sql_datetime_read_as_native_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_datetime_read_as_native_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime']) - - -class sql_datetime_write_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_datetime_write_sqlalchemy(self): - self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') - - -class sql_float_read_query_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_query_fallback(self): - read_sql_query('SELECT float FROM test_type', self.con) - - -class sql_float_read_query_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_query_sqlalchemy(self): - read_sql_query('SELECT 
float FROM test_type', self.engine) - - -class sql_float_read_table_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_table_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['float']) - - -class sql_float_write_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_float_write_fallback(self): - self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - - -class sql_float_write_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_float_write_sqlalchemy(self): - self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') - - -class sql_read_query_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_query_fallback(self): - read_sql_query('SELECT * FROM test2', self.con) - - -class sql_read_query_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_query_sqlalchemy(self): - read_sql_query('SELECT * FROM test2', self.engine) - - -class sql_read_table_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_table_sqlalchemy(self): - read_sql_table('test2', 
self.engine) - - -class sql_string_write_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_string_write_fallback(self): - self.df[['string']].to_sql('test_string', self.con, if_exists='replace') - - -class sql_string_write_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_string_write_sqlalchemy(self): - self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') - - -class sql_write_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - - def time_sql_write_fallback(self): - self.df.to_sql('test1', self.con, if_exists='replace') - - -class sql_write_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - - def time_sql_write_sqlalchemy(self): - self.df.to_sql('test1', self.engine, if_exists='replace') \ No newline at end of file diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 1da0d37d4a8dd..de0a3b33da147 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,33 +1,25 @@ -from .pandas_vb_common import * +import warnings +import string +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, concat, merge, + merge_asof) +try: + from pandas import merge_ordered +except ImportError: + from pandas import ordered_merge as merge_ordered -class append_frame_single_homogenous(object): - goal_time = 0.2 - - def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.df2 = self.df1.copy() - self.df2.index = np.arange(10000, 20000) - self.mdf1 = self.df1.copy() - self.mdf1['obj1'] = 'bar' - self.mdf1['obj2'] = 'bar' - self.mdf1['int1'] = 5 - try: - self.mdf1.consolidate(inplace=True) - except: - pass - self.mdf2 = self.mdf1.copy() - self.mdf2.index = self.df2.index +from .pandas_vb_common import Panel, setup # noqa - def time_append_frame_single_homogenous(self): - self.df1.append(self.df2) +class Append(object): -class append_frame_single_mixed(object): goal_time = 0.2 def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) + self.df1 = DataFrame(np.random.randn(10000, 4), + columns=['A', 'B', 'C', 'D']) self.df2 = self.df1.copy() self.df2.index = 
np.arange(10000, 20000) self.mdf1 = self.df1.copy() @@ -35,325 +27,336 @@ def setup(self): self.mdf1['obj2'] = 'bar' self.mdf1['int1'] = 5 try: - self.mdf1.consolidate(inplace=True) + with warnings.catch_warnings(record=True): + self.mdf1.consolidate(inplace=True) except: pass self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index - def time_append_frame_single_mixed(self): + def time_append_homogenous(self): + self.df1.append(self.df2) + + def time_append_mixed(self): self.mdf1.append(self.mdf2) -class concat_empty_frames1(object): +class Concat(object): + goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() + def setup(self, axis): + N = 1000 + s = Series(N, index=tm.makeStringIndex(N)) + self.series = [s[i:- i] for i in range(1, 10)] * 50 + self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 + df = DataFrame({'A': range(N)}, + index=date_range('20130101', periods=N, freq='s')) + self.empty_left = [DataFrame(), df] + self.empty_right = [df, DataFrame()] - def time_concat_empty_frames1(self): - concat([self.df, self.empty]) + def time_concat_series(self, axis): + concat(self.series, axis=axis) + def time_concat_small_frames(self, axis): + concat(self.small_frames, axis=axis) -class concat_empty_frames2(object): - goal_time = 0.2 + def time_concat_empty_right(self, axis): + concat(self.empty_right, axis=axis) - def setup(self): - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() + def time_concat_empty_left(self, axis): + concat(self.empty_left, axis=axis) - def time_concat_empty_frames2(self): - concat([self.empty, self.df]) +class ConcatPanels(object): -class concat_series_axis1(object): goal_time = 0.2 + params = ([0, 1, 2], [True, False]) + param_names = ['axis', 'ignore_index'] - def setup(self): - self.n = 1000 - self.indices = tm.makeStringIndex(1000) - self.s = Series(self.n, index=self.indices) - self.pieces = [self.s[i:(- i)] for i in range(1, 10)] - self.pieces = (self.pieces * 50) + def setup(self, axis, ignore_index): + with warnings.catch_warnings(record=True): + panel_c = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='C')) + self.panels_c = [panel_c] * 20 + panel_f = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='F')) + self.panels_f = [panel_f] * 20 - def time_concat_series_axis1(self): - concat(self.pieces, axis=1) + def time_c_ordered(self, axis, ignore_index): + with warnings.catch_warnings(record=True): + concat(self.panels_c, axis=axis, ignore_index=ignore_index) + def time_f_ordered(self, axis, ignore_index): + with warnings.catch_warnings(record=True): + concat(self.panels_f, axis=axis, ignore_index=ignore_index) -class concat_small_frames(object): - goal_time = 0.2 - def setup(self): - self.df = pd.DataFrame(randn(5, 4)) +class ConcatDataFrames(object): - def time_concat_small_frames(self): - concat(([self.df] * 1000)) + goal_time = 0.2 + params = ([0, 1], [True, False]) + param_names = ['axis', 'ignore_index'] + def setup(self, axis, ignore_index): + frame_c = DataFrame(np.zeros((10000, 200), + dtype=np.float32, order='C')) + self.frame_c = [frame_c] * 20 + frame_f = DataFrame(np.zeros((10000, 200), + dtype=np.float32, order='F')) + self.frame_f = [frame_f] * 20 -class i8merge(object): - goal_time = 0.2 + def time_c_ordered(self, axis, ignore_index): + concat(self.frame_c, axis=axis, 
ignore_index=ignore_index) - def setup(self): - (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) - self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) - self.left['left'] = self.left.sum(axis=1) - self.i = np.random.permutation(len(self.left)) - self.right = self.left.iloc[self.i].copy() - self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) - self.right.index = np.arange(len(self.right)) - self.right['right'] *= (-1) + def time_f_ordered(self, axis, ignore_index): + concat(self.frame_f, axis=axis, ignore_index=ignore_index) - def time_i8merge(self): - merge(self.left, self.right, how='outer') +class Join(object): -class join_dataframe_index_multi(object): goal_time = 0.2 + params = [True, False] + param_names = ['sort'] - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - try: - self.DataFrame = DataMatrix - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + def setup(self, sort): + level1 = tm.makeStringIndex(10).values + level2 = tm.makeStringIndex(1000).values + label1 = np.arange(10).repeat(1000) + label2 = np.tile(np.arange(1000), 10) + index2 = MultiIndex(levels=[level1, level2], + labels=[label1, label2]) + self.df_multi = DataFrame(np.random.randn(len(index2), 4), + index=index2, + columns=['A', 'B', 'C', 'D']) - def time_join_dataframe_index_multi(self): - self.df.join(self.df_multi, on=['key1', 'key2']) + self.key1 = np.tile(level1.take(label1), 10) + self.key2 = np.tile(level2.take(label2), 10) + self.df = DataFrame({'data1': np.random.randn(100000), + 'data2': np.random.randn(100000), + 'key1': self.key1, + 'key2': self.key2}) + self.df_key1 = DataFrame(np.random.randn(len(level1), 4), + index=level1, + columns=['A', 'B', 'C', 'D']) + self.df_key2 = DataFrame(np.random.randn(len(level2), 4), + index=level2, + columns=['A', 'B', 'C', 'D']) -class join_dataframe_index_single_key_bigger(object): - goal_time = 0.2 + shuf = np.arange(100000) + np.random.shuffle(shuf) + self.df_shuf = self.df.reindex(self.df.index[shuf]) - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) 
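(The rewritten classes above lean on asv's parameterized-benchmark protocol: `params` lists the argument values, `param_names` labels them, and asv calls `setup` and every `time_*` method once per combination. A minimal sketch of that calling convention, assuming asv's documented behavior; the `ExampleBench` class and the driver loop below are illustrative only, not part of asv or of this patch:

import itertools

class ExampleBench(object):
    # asv reads these class attributes to build the parameter grid
    params = ([0, 1], [True, False])
    param_names = ['axis', 'ignore_index']

    def setup(self, axis, ignore_index):
        self.data = list(range(1000))

    def time_example(self, axis, ignore_index):
        sorted(self.data, reverse=ignore_index)

# Illustrative driver: asv itself iterates the cartesian product of params,
# calling setup() before each timed method with the same argument tuple.
for combo in itertools.product(*ExampleBench.params):
    bench = ExampleBench()
    bench.setup(*combo)
    bench.time_example(*combo)

Each combination then shows up as a separate result in the asv report, so one class covers what previously required a separate `*_sort`/`*_nosort` class per variant.)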
- try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - try: - self.DataFrame = DataMatrix - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + def time_join_dataframe_index_multi(self, sort): + self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort) + + def time_join_dataframe_index_single_key_bigger(self, sort): + self.df.join(self.df_key2, on='key2', sort=sort) - def time_join_dataframe_index_single_key_bigger(self): - self.df.join(self.df_key2, on='key2') + def time_join_dataframe_index_single_key_small(self, sort): + self.df.join(self.df_key1, on='key1', sort=sort) + def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): + self.df_shuf.join(self.df_key2, on='key2', sort=sort) + + +class JoinIndex(object): -class join_dataframe_index_single_key_bigger_sort(object): goal_time = 0.2 def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - try: - self.DataFrame = DataMatrix - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + N = 50000 + self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)), + columns=['jim', 'joe']) + self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)), + columns=['jolie', 'jolia']).set_index('jolie') - def time_join_dataframe_index_single_key_bigger_sort(self): - self.df_shuf.join(self.df_key2, on='key2', sort=True) + def time_left_outer_join_index(self): + self.left.join(self.right, on='jim') -class join_dataframe_index_single_key_small(object): +class JoinNonUnique(object): + # outer join of non-unique + # GH 6329 goal_time = 0.2 def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - 
self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - try: - self.DataFrame = DataMatrix - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') + daily_dates = date_index.to_period('D').to_timestamp('S', 'S') + self.fracofday = date_index.values - daily_dates.values + self.fracofday = self.fracofday.astype('timedelta64[ns]') + self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0 + self.fracofday = Series(self.fracofday, daily_dates) + index = date_range(date_index.min(), date_index.max(), freq='D') + self.temp = Series(1.0, index)[self.fracofday.index] - def time_join_dataframe_index_single_key_small(self): - self.df.join(self.df_key1, on='key1') + def time_join_non_unique_equal(self): + self.fracofday * self.temp -class join_dataframe_integer_2key(object): - goal_time = 0.2 +class Merge(object): - def setup(self): - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) + goal_time = 0.2 + params = [True, False] + param_names = ['sort'] + + def setup(self, sort): + N = 10000 + indices = tm.makeStringIndex(N).values + indices2 = tm.makeStringIndex(N).values + key = np.tile(indices[:8000], 10) + key2 = np.tile(indices2[:8000], 10) + self.left = DataFrame({'key': key, 'key2': key2, + 'value': np.random.randn(80000)}) + self.right = DataFrame({'key': indices[2000:], + 'key2': indices2[2000:], + 'value2': np.random.randn(8000)}) + + self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), + 'key2': np.tile(np.arange(250).repeat(10), 4), + 'value': np.random.randn(10000)}) + self.df2 = DataFrame({'key1': np.arange(500), + 'value2': np.random.randn(500)}) self.df3 = self.df[:5000] - def time_join_dataframe_integer_2key(self): - merge(self.df, self.df3) + def time_merge_2intkey(self, sort): + merge(self.left, self.right, sort=sort) + + def time_merge_dataframe_integer_2key(self, sort): + merge(self.df, self.df3, sort=sort) + + def time_merge_dataframe_integer_key(self, sort): + merge(self.df, self.df2, on='key1', sort=sort) -class join_dataframe_integer_key(object): +class I8Merge(object): + goal_time = 0.2 + params = ['inner', 'outer', 'left', 'right'] + param_names = ['how'] - def setup(self): - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 
'value': np.random.randn(10000), }) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) - self.df3 = self.df[:5000] + def setup(self, how): + low, high, n = -1000, 1000, 10**6 + self.left = DataFrame(np.random.randint(low, high, (n, 7)), + columns=list('ABCDEFG')) + self.left['left'] = self.left.sum(axis=1) + self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1) + self.right = self.right.reset_index(drop=True) + self.right['right'] *= -1 + + def time_i8merge(self, how): + merge(self.left, self.right, how=how) - def time_join_dataframe_integer_key(self): - merge(self.df, self.df2, on='key1') +class MergeCategoricals(object): -class join_non_unique_equal(object): goal_time = 0.2 def setup(self): - self.date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') - self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S') - self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray)) - self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0) - self.fracofday = TimeSeries(self.fracofday, self.daily_dates) - self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D') - self.temp = TimeSeries(1.0, self.index) + self.left_object = DataFrame( + {'X': np.random.choice(range(0, 10), size=(10000,)), + 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))}) - def time_join_non_unique_equal(self): - (self.fracofday * self.temp[self.fracofday.index]) + self.right_object = DataFrame( + {'X': np.random.choice(range(0, 10), size=(10000,)), + 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))}) + self.left_cat = self.left_object.assign( + Y=self.left_object['Y'].astype('category')) + self.right_cat = self.right_object.assign( + Z=self.right_object['Z'].astype('category')) + + def time_merge_object(self): + merge(self.left_object, self.right_object, on='X') + + def time_merge_cat(self): + merge(self.left_cat, self.right_cat, on='X') -class left_outer_join_index(object): - goal_time = 0.2 + +class MergeOrdered(object): def setup(self): - np.random.seed(2718281) - self.n = 50000 - self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) - self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') + groups = tm.makeStringIndex(10).values + self.left = DataFrame({'group': groups.repeat(5000), + 'key': np.tile(np.arange(0, 10000, 2), 10), + 'lvalue': np.random.randn(50000)}) + self.right = DataFrame({'key': np.arange(10000), + 'rvalue': np.random.randn(10000)}) - def time_left_outer_join_index(self): - self.left.join(self.right, on='jim') + def time_merge_ordered(self): + merge_ordered(self.left, self.right, on='key', left_by='group') -class merge_2intkey_nosort(object): - goal_time = 0.2 +class MergeAsof(object): def setup(self): - self.N = 10000 - self.indices = tm.makeStringIndex(self.N).values - self.indices2 = tm.makeStringIndex(self.N).values - self.key = np.tile(self.indices[:8000], 10) - self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) - self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + one_count = 200000 + two_count = 1000000 - def time_merge_2intkey_nosort(self): - merge(self.left, self.right, 
sort=False) + df1 = DataFrame( + {'time': np.random.randint(0, one_count / 20, one_count), + 'key': np.random.choice(list(string.ascii_uppercase), one_count), + 'key2': np.random.randint(0, 25, one_count), + 'value1': np.random.randn(one_count)}) + df2 = DataFrame( + {'time': np.random.randint(0, two_count / 20, two_count), + 'key': np.random.choice(list(string.ascii_uppercase), two_count), + 'key2': np.random.randint(0, 25, two_count), + 'value2': np.random.randn(two_count)}) + df1 = df1.sort_values('time') + df2 = df2.sort_values('time') -class merge_2intkey_sort(object): - goal_time = 0.2 + df1['time32'] = np.int32(df1.time) + df2['time32'] = np.int32(df2.time) - def setup(self): - self.N = 10000 - self.indices = tm.makeStringIndex(self.N).values - self.indices2 = tm.makeStringIndex(self.N).values - self.key = np.tile(self.indices[:8000], 10) - self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) - self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + self.df1a = df1[['time', 'value1']] + self.df2a = df2[['time', 'value2']] + self.df1b = df1[['time', 'key', 'value1']] + self.df2b = df2[['time', 'key', 'value2']] + self.df1c = df1[['time', 'key2', 'value1']] + self.df2c = df2[['time', 'key2', 'value2']] + self.df1d = df1[['time32', 'value1']] + self.df2d = df2[['time32', 'value2']] + self.df1e = df1[['time', 'key', 'key2', 'value1']] + self.df2e = df2[['time', 'key', 'key2', 'value2']] - def time_merge_2intkey_sort(self): - merge(self.left, self.right, sort=True) + def time_on_int(self): + merge_asof(self.df1a, self.df2a, on='time') + def time_on_int32(self): + merge_asof(self.df1d, self.df2d, on='time32') -class series_align_int64_index(object): - goal_time = 0.2 + def time_by_object(self): + merge_asof(self.df1b, self.df2b, on='time', by='key') - def setup(self): - self.n = 1000000 - self.sz = 500000 - self.rng = np.arange(0, 10000000000000, 10000000) - self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng) - self.idx1 = np.sort(self.sample(self.stamps, self.sz)) - self.idx2 = np.sort(self.sample(self.stamps, self.sz)) - self.ts1 = Series(np.random.randn(self.sz), self.idx1) - self.ts2 = Series(np.random.randn(self.sz), self.idx2) + def time_by_int(self): + merge_asof(self.df1c, self.df2c, on='time', by='key2') - def time_series_align_int64_index(self): - (self.ts1 + self.ts2) + def time_multiby(self): + merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2']) - def sample(self, values, k): - self.sampler = np.random.permutation(len(values)) - return values.take(self.sampler[:k]) +class Align(object): -class series_align_left_monotonic(object): goal_time = 0.2 def setup(self): - self.n = 1000000 - self.sz = 500000 - self.rng = np.arange(0, 10000000000000, 10000000) - self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng) - self.idx1 = np.sort(self.sample(self.stamps, self.sz)) - self.idx2 = np.sort(self.sample(self.stamps, self.sz)) - self.ts1 = Series(np.random.randn(self.sz), self.idx1) - self.ts2 = Series(np.random.randn(self.sz), self.idx2) + size = 5 * 10**5 + rng = np.arange(0, 10**13, 10**7) + stamps = np.datetime64('now').view('i8') + rng + idx1 = np.sort(np.random.choice(stamps, size, replace=False)) + idx2 = np.sort(np.random.choice(stamps, size, replace=False)) + self.ts1 = Series(np.random.randn(size), idx1) + self.ts2 = Series(np.random.randn(size), idx2) + + def 
time_series_align_int64_index(self): + self.ts1 + self.ts2 def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join='left') - - def sample(self, values, k): - self.sampler = np.random.permutation(len(values)) - return values.take(self.sampler[:k]) \ No newline at end of file diff --git a/asv_bench/benchmarks/miscellaneous.py b/asv_bench/benchmarks/miscellaneous.py deleted file mode 100644 index fe610ef4cb376..0000000000000 --- a/asv_bench/benchmarks/miscellaneous.py +++ /dev/null @@ -1,30 +0,0 @@ -from .pandas_vb_common import * -from pandas.util.decorators import cache_readonly - - -class match_strings(object): - goal_time = 0.2 - - def setup(self): - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) - - def time_match_strings(self): - match(self.all, self.uniques) - - -class misc_cache_readonly(object): - goal_time = 0.2 - - def setup(self): - - - class Foo: - - @cache_readonly - def prop(self): - return 5 - self.obj = Foo() - - def time_misc_cache_readonly(self): - self.obj.prop \ No newline at end of file diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py new file mode 100644 index 0000000000000..0c92214795557 --- /dev/null +++ b/asv_bench/benchmarks/multiindex_object.py @@ -0,0 +1,140 @@ +import string + +import numpy as np +import pandas.util.testing as tm +from pandas import date_range, MultiIndex + +from .pandas_vb_common import setup # noqa + + +class GetLoc(object): + + goal_time = 0.2 + + def setup(self): + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list('A')], + names=['one', 'two', 'three']) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list('A'), list('A')], + names=['one', 'two', 'three']) + + def time_large_get_loc(self): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_med_get_loc(self): + self.mi_med.get_loc((999, 9, 'A')) + + def time_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + + def time_string_get_loc(self): + self.mi_small.get_loc((99, 'A', 'A')) + + def time_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) + + +class Duplicates(object): + + goal_time = 0.2 + + def setup(self): + size = 65536 + arrays = [np.random.randint(0, 8192, size), + np.random.randint(0, 1024, size)] + mask = np.random.rand(size) < 0.1 + self.mi_unused_levels = MultiIndex.from_arrays(arrays) + self.mi_unused_levels = self.mi_unused_levels[mask] + + def time_remove_unused_levels(self): + self.mi_unused_levels.remove_unused_levels() + + +class Integer(object): + + goal_time = 0.2 + + def setup(self): + self.mi_int = MultiIndex.from_product([np.arange(1000), + np.arange(1000)], + names=['one', 'two']) + self.obj_index = np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object) + + def time_get_indexer(self): + self.mi_int.get_indexer(self.obj_index) + + def time_is_monotonic(self): + self.mi_int.is_monotonic + + +class Duplicated(object): + + goal_time = 0.2 + + def setup(self): + n, k = 200, 5000 + levels = [np.arange(n), + tm.makeStringIndex(n).values, + 1000 + np.arange(n)] + labels = [np.random.choice(n, (k * n)) for lev in levels] + self.mi = 
MultiIndex(levels=levels, labels=labels) + + def time_duplicated(self): + self.mi.duplicated() + + +class Sortlevel(object): + + goal_time = 0.2 + + def setup(self): + n = 1182720 + low, high = -4096, 4096 + arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) + for k in [11, 7, 5, 3, 1]] + self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)] + + a = np.repeat(np.arange(100), 1000) + b = np.tile(np.arange(1000), 100) + self.mi = MultiIndex.from_arrays([a, b]) + self.mi = self.mi.take(np.random.permutation(np.arange(100000))) + + def time_sortlevel_int64(self): + self.mi_int.sortlevel() + + def time_sortlevel_zero(self): + self.mi.sortlevel(0) + + def time_sortlevel_one(self): + self.mi.sortlevel(1) + + +class Values(object): + + goal_time = 0.2 + + def setup_cache(self): + + level1 = range(1000) + level2 = date_range(start='1/1/2012', periods=100) + mi = MultiIndex.from_product([level1, level2]) + return mi + + def time_datetime_level_values_copy(self, mi): + mi.copy().values + + def time_datetime_level_values_sliced(self, mi): + mi[:10].values diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py new file mode 100644 index 0000000000000..e161b887ee86f --- /dev/null +++ b/asv_bench/benchmarks/offset.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +import warnings +from datetime import datetime + +import numpy as np +import pandas as pd +try: + import pandas.tseries.holiday # noqa +except ImportError: + pass + +hcal = pd.tseries.holiday.USFederalHolidayCalendar() +# These offsets currently raise a NotImplementedError with .apply_index() +non_apply = [pd.offsets.Day(), + pd.offsets.BYearEnd(), + pd.offsets.BYearBegin(), + pd.offsets.BQuarterEnd(), + pd.offsets.BQuarterBegin(), + pd.offsets.BMonthEnd(), + pd.offsets.BMonthBegin(), + pd.offsets.CustomBusinessDay(), + pd.offsets.CustomBusinessDay(calendar=hcal), + pd.offsets.CustomBusinessMonthBegin(calendar=hcal), + pd.offsets.CustomBusinessMonthEnd(calendar=hcal)] +other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(), + pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(), + pd.offsets.MonthEnd(), pd.offsets.MonthBegin(), + pd.offsets.DateOffset(months=2, days=2), + pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(), + pd.offsets.SemiMonthBegin()] +offsets = non_apply + other_offsets + + +class ApplyIndex(object): + + goal_time = 0.2 + + params = other_offsets + param_names = ['offset'] + + def setup(self, offset): + N = 10000 + self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T') + + def time_apply_index(self, offset): + offset.apply_index(self.rng) + + +class OnOffset(object): + + goal_time = 0.2 + + params = offsets + param_names = ['offset'] + + def setup(self, offset): + self.dates = [datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31)] + + def time_on_offset(self, offset): + for date in self.dates: + offset.onOffset(date) + + +class OffsetSeriesArithmetic(object): + + goal_time = 0.2 + params = offsets + param_names = ['offset'] + + def setup(self, offset): + N = 1000 + rng = pd.date_range(start='1/1/2000', periods=N, freq='T') + self.data = pd.Series(rng) + + def time_add_offset(self, offset): + with warnings.catch_warnings(record=True): + self.data + offset + + +class OffsetDatetimeIndexArithmetic(object): + + goal_time = 0.2 + params = offsets + param_names = ['offset'] + + def setup(self, offset): + N = 1000 + self.data = pd.date_range(start='1/1/2000', 
periods=N, freq='T') + + def time_add_offset(self, offset): + with warnings.catch_warnings(record=True): + self.data + offset + + +class OffsetDatetimeArithmetic(object): + + goal_time = 0.2 + params = offsets + param_names = ['offset'] + + def setup(self, offset): + self.date = datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + + def time_apply(self, offset): + offset.apply(self.date) + + def time_apply_np_dt64(self, offset): + offset.apply(self.dt64) + + def time_add(self, offset): + self.date + offset + + def time_add_10(self, offset): + self.date + (10 * offset) + + def time_subtract(self, offset): + self.date - offset + + def time_subtract_10(self, offset): + self.date - (10 * offset) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py deleted file mode 100644 index 12e48295d8d05..0000000000000 --- a/asv_bench/benchmarks/packers.py +++ /dev/null @@ -1,857 +0,0 @@ -from .pandas_vb_common import * -from numpy.random import randint -import pandas as pd -from collections import OrderedDict -from pandas.compat import BytesIO -import sqlite3 -import os -from sqlalchemy import create_engine -import numpy as np -from random import randrange -from pandas.core import common as com - - -class packers_read_csv(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_csv(self.f) - - def time_packers_read_csv(self): - pd.read_csv(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_excel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_packers_read_excel(self): - self.bio.seek(0) - pd.read_excel(self.bio) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_hdf_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - 
self.remove(self.f) - self.df2.to_hdf(self.f, 'df') - - def time_packers_read_hdf_store(self): - pd.read_hdf(self.f, 'df') - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_hdf_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df2.to_hdf(self.f, 'df', format='table') - - def time_packers_read_hdf_table(self): - pd.read_hdf(self.f, 'df') - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_json(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_json(self.f, orient='split') - self.df.index = np.arange(self.N) - - def time_packers_read_json(self): - pd.read_json(self.f, orient='split') - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_json_date_index(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_json(self.f, orient='split') - - def time_packers_read_json_date_index(self): - pd.read_json(self.f, orient='split') - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_pack(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df2.to_msgpack(self.f) - - def time_packers_read_pack(self): - pd.read_msgpack(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_pickle(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - 
self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df2.to_pickle(self.f) - - def time_packers_read_pickle(self): - pd.read_pickle(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_sql(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.engine = create_engine('sqlite:///:memory:') - self.df2.to_sql('table', self.engine, if_exists='replace') - - def time_packers_read_sql(self): - pd.read_sql_table('table', self.engine) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_stata(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_read_stata(self): - pd.read_stata(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_stata_with_validation(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_read_stata_with_validation(self): - pd.read_stata(self.f) - - def remove(self, f): - try: - os.remove(self.f) - 
except: - pass - - -class packers_write_csv(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_csv(self): - self.df.to_csv(self.f) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_excel_openpyxl(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_openpyxl(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_excel_xlsxwriter(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_xlsxwriter(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_excel_xlwt(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_xlwt(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - 
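(Every one of these deleted packers classes repeats the same `remove`/`teardown` boilerplate around a scratch file; the `BaseIO` helper added to `pandas_vb_common.py` later in this diff factors that out. A minimal sketch of how a rewritten IO benchmark could subclass it, assuming the `BaseIO` definition shown below in this patch; the `ToCSV` class name and its `fname` value are hypothetical examples, not code from this patch:

import numpy as np
from pandas import DataFrame

from .pandas_vb_common import BaseIO, setup  # noqa


class ToCSV(BaseIO):
    # BaseIO.teardown() removes self.fname after each timed run,
    # so the subclass only declares the scratch file and the workload.
    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 5))

    def time_write_csv(self):
        self.df.to_csv(self.fname)
)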
-class packers_write_hdf_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_hdf_store(self): - self.df2.to_hdf(self.f, 'df') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_hdf_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_hdf_table(self): - self.df2.to_hdf(self.f, 'df', table=True) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.index = np.arange(self.N) - - def time_packers_write_json(self): - self.df.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_T(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.index = np.arange(self.N) - - def time_packers_write_json_T(self): - self.df.to_json(self.f, orient='columns') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_date_index(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - 
self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_json_date_index(self): - self.df.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_delta_int_tstamp(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_packers_write_json_mixed_delta_int_tstamp(self): - self.df_mixed.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_float_int(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_packers_write_json_mixed_float_int(self): - self.df_mixed.to_json(self.f, orient='index') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_float_int_T(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for 
i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_packers_write_json_mixed_float_int_T(self): - self.df_mixed.to_json(self.f, orient='columns') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_float_int_str(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_packers_write_json_mixed_float_int_str(self): - self.df_mixed.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_pack(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_pack(self): - self.df2.to_msgpack(self.f) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_pickle(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_pickle(self): - self.df2.to_pickle(self.f) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_sql(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 
100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.engine = create_engine('sqlite:///:memory:') - - def time_packers_write_sql(self): - self.df2.to_sql('table', self.engine, if_exists='replace') - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_stata(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_write_stata(self): - self.df.to_stata(self.f, {'index': 'tc', }) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_stata_with_validation(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_write_stata_with_validation(self): - self.df.to_stata(self.f, {'index': 'tc', }) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass \ No newline at end of file diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 3370131929c22..e255cd94f265b 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,31 +1,43 @@ -from pandas import * -import pandas as pd -from datetime import timedelta -from numpy.random import randn -from numpy.random import randint -from numpy.random import permutation -import pandas.util.testing as tm -import random +import os +from importlib import import_module + import numpy as np -import threading -try: - from pandas.compat import range -except ImportError: - pass - 
-np.random.seed(1234) -try: - import pandas._tseries as lib -except: - import pandas.lib as lib - -try: - Panel = WidePanel -except Exception: - pass - -# didn't add to namespace until later -try: - from pandas.core.index import MultiIndex -except ImportError: - pass +from pandas import Panel + +# Compatibility import for lib +for imp in ['pandas._libs.lib', 'pandas.lib']: + try: + lib = import_module(imp) + break + except: + pass + +numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, + np.float64, np.int16, np.int8, np.uint16, np.uint8] +datetime_dtypes = [np.datetime64, np.timedelta64] + + +def setup(*args, **kwargs): + # This function just needs to be imported into each benchmark file to + # set up the random seed before each function. + # http://asv.readthedocs.io/en/latest/writing_benchmarks.html + np.random.seed(1234) + + +class BaseIO(object): + """ + Base class for IO benchmarks + """ + fname = None + + def remove(self, f): + """Remove created files""" + try: + os.remove(f) + except: + # On Windows, attempting to remove a file that is in use + # causes an exception to be raised + pass + + def teardown(self, *args, **kwargs): + self.remove(self.fname) diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 0b0e73847aa96..ce946c76ed199 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,64 +1,60 @@ -from .pandas_vb_common import * +import warnings +from datetime import datetime, timedelta +from pandas import DataFrame, DatetimeIndex, date_range -class panel_from_dict_all_different_indexes(object): - goal_time = 0.2 - - def setup(self): - self.data_frames = {} - self.start = datetime(1990, 1, 1) - self.end = datetime(2012, 1, 1) - for x in range(100): - self.end += timedelta(days=1) - self.dr = np.asarray(date_range(self.start, self.end)) - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df - - def time_panel_from_dict_all_different_indexes(self): - Panel.from_dict(self.data_frames) +from .pandas_vb_common import Panel, setup # noqa -class panel_from_dict_equiv_indexes(object): +class DifferentIndexes(object): goal_time = 0.2 def setup(self): self.data_frames = {} + start = datetime(1990, 1, 1) + end = datetime(2012, 1, 1) for x in range(100): - self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1))) - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df + end += timedelta(days=1) + idx = date_range(start, end) + df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx) + self.data_frames[x] = df - def time_panel_from_dict_equiv_indexes(self): - Panel.from_dict(self.data_frames) + def time_from_dict(self): + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) -class panel_from_dict_same_index(object): +class SameIndexes(object): + goal_time = 0.2 def setup(self): - self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1))) - self.data_frames = {} - for x in range(100): - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df + idx = DatetimeIndex(start=datetime(1990, 1, 1), + end=datetime(2012, 1, 1), + freq='D') + df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx) + 
self.data_frames = dict(enumerate([df] * 100)) - def time_panel_from_dict_same_index(self): - Panel.from_dict(self.data_frames) + def time_from_dict(self): + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) -class panel_from_dict_two_different_indexes(object): +class TwoIndexes(object): + goal_time = 0.2 def setup(self): - self.data_frames = {} - self.start = datetime(1990, 1, 1) - self.end = datetime(2012, 1, 1) - for x in range(100): - if (x == 50): - self.end += timedelta(days=1) - self.dr = np.asarray(date_range(self.start, self.end)) - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df - - def time_panel_from_dict_two_different_indexes(self): - Panel.from_dict(self.data_frames) \ No newline at end of file + start = datetime(1990, 1, 1) + end = datetime(2012, 1, 1) + df1 = DataFrame({'a': 0, 'b': 1, 'c': 2}, + index=DatetimeIndex(start=start, end=end, freq='D')) + end += timedelta(days=1) + df2 = DataFrame({'a': 0, 'b': 1, 'c': 2}, + index=DatetimeIndex(start=start, end=end, freq='D')) + dfs = [df1] * 50 + [df2] * 50 + self.data_frames = dict(enumerate(dfs)) + + def time_from_dict(self): + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 90118eaf6e407..a5b1a92e9cf67 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,56 +1,24 @@ -from .pandas_vb_common import * +import warnings +import numpy as np -class panel_pct_change_items(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_items(self): - self.panel.pct_change(1, axis='items') - - -class panel_pct_change_major(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_major(self): - self.panel.pct_change(1, axis='major') +from .pandas_vb_common import Panel, setup # noqa -class panel_pct_change_minor(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_minor(self): - self.panel.pct_change(1, axis='minor') - +class PanelMethods(object): -class panel_shift(object): goal_time = 0.2 + params = ['items', 'major', 'minor'] + param_names = ['axis'] - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_shift(self): - self.panel.shift(1) - - -class panel_shift_minor(object): - goal_time = 0.2 + def setup(self, axis): + with warnings.catch_warnings(record=True): + self.panel = Panel(np.random.randn(100, 1000, 100)) - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) + def time_pct_change(self, axis): + with warnings.catch_warnings(record=True): + self.panel.pct_change(1, axis=axis) - def time_panel_shift_minor(self): - self.panel.shift(1, axis='minor') \ No newline at end of file + def time_shift(self, axis): + with warnings.catch_warnings(record=True): + self.panel.shift(1, 
axis=axis) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py deleted file mode 100644 index 18cd4de6cc9c5..0000000000000 --- a/asv_bench/benchmarks/parser_vb.py +++ /dev/null @@ -1,112 +0,0 @@ -from .pandas_vb_common import * -import os -from pandas import read_csv, read_table -try: - from cStringIO import StringIO -except ImportError: - from io import StringIO - - -class read_csv_comment2(object): - goal_time = 0.2 - - def setup(self): - self.data = ['A,B,C'] - self.data = (self.data + (['1,2,3 # comment'] * 100000)) - self.data = '\n'.join(self.data) - - def time_read_csv_comment2(self): - read_csv(StringIO(self.data), comment='#') - - -class read_csv_default_converter(object): - goal_time = 0.2 - - def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_default_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) - - -class read_csv_precise_converter(object): - goal_time = 0.2 - - def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_precise_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision='high') - - -class read_csv_roundtrip_converter(object): - goal_time = 0.2 - - def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_roundtrip_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision='round_trip') - - -class read_csv_thou_vb(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.format = (lambda x: '{:,}'.format(x)) - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) - self.df = self.df.applymap(self.format) - self.df.to_csv('test.csv', sep='|') - - def time_read_csv_thou_vb(self): - read_csv('test.csv', sep='|', thousands=',') - - def teardown(self): - os.remove('test.csv') - - -class read_csv_vb(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, 
self.K)))) - self.df.to_csv('test.csv', sep='|') - - def time_read_csv_vb(self): - read_csv('test.csv', sep='|') - - def teardown(self): - os.remove('test.csv') - - -class read_table_multiple_date(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' - self.data = (self.data * 200) - - def time_read_table_multiple_date(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[[1, 2], [1, 3]]) - - -class read_table_multiple_date_baseline(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.data = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' - self.data = (self.data * 200) - - def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) \ No newline at end of file diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py new file mode 100644 index 0000000000000..c34f9a737473e --- /dev/null +++ b/asv_bench/benchmarks/period.py @@ -0,0 +1,121 @@ +from pandas import (DataFrame, Series, Period, PeriodIndex, date_range, + period_range) + + +class PeriodProperties(object): + + params = (['M', 'min'], + ['year', 'month', 'day', 'hour', 'minute', 'second', + 'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth', + 'dayofweek', 'dayofyear', 'start_time', 'end_time']) + param_names = ['freq', 'attr'] + + def setup(self, freq, attr): + self.per = Period('2012-06-01', freq=freq) + + def time_property(self, freq, attr): + getattr(self.per, attr) + + +class PeriodUnaryMethods(object): + + params = ['M', 'min'] + param_names = ['freq'] + + def setup(self, freq): + self.per = Period('2012-06-01', freq=freq) + + def time_to_timestamp(self, freq): + self.per.to_timestamp() + + def time_now(self, freq): + self.per.now(freq) + + def time_asfreq(self, freq): + self.per.asfreq('A') + + +class PeriodIndexConstructor(object): + + goal_time = 0.2 + + params = ['D'] + param_names = ['freq'] + + def setup(self, freq): + self.rng = date_range('1985', periods=1000) + self.rng2 = date_range('1985', periods=1000).to_pydatetime() + + def time_from_date_range(self, freq): + PeriodIndex(self.rng, freq=freq) + + def time_from_pydatetime(self, freq): + PeriodIndex(self.rng2, freq=freq) + + +class DataFramePeriodColumn(object): + + goal_time = 0.2 + + def setup(self): + self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.df = DataFrame(index=range(len(self.rng))) + + def time_setitem_period_column(self): + self.df['col'] = self.rng + + def time_set_index(self): + # GH#21582 limited by comparisons of Period objects + self.df['col2'] = self.rng + self.df.set_index('col2', append=True) + + +class Algorithms(object): + + goal_time = 0.2 + + params = ['index', 'series'] + param_names = ['typ'] + + def setup(self, 
typ): + data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), + Period('2011-03', freq='M'), Period('2011-04', freq='M')] + + if typ == 'index': + self.vector = PeriodIndex(data * 1000, freq='M') + elif typ == 'series': + self.vector = Series(data * 1000) + + def time_drop_duplicates(self, typ): + self.vector.drop_duplicates() + + def time_value_counts(self, typ): + self.vector.value_counts() + + +class Indexing(object): + + goal_time = 0.2 + + def setup(self): + self.index = PeriodIndex(start='1985', periods=1000, freq='D') + self.series = Series(range(1000), index=self.index) + self.period = self.index[500] + + def time_get_loc(self): + self.index.get_loc(self.period) + + def time_shape(self): + self.index.shape + + def time_shallow_copy(self): + self.index._shallow_copy() + + def time_series_loc(self): + self.series.loc[self.period] + + def time_align(self): + DataFrame({'a': self.series, 'b': self.series[:500]}) + + def time_intersection(self): + self.index[:750].intersection(self.index[250:]) diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index f46082ac6f288..5b49112b0e07d 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,19 +1,64 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame, Series, DatetimeIndex, date_range try: - from pandas import date_range + from pandas.plotting import andrews_curves except ImportError: + from pandas.tools.plotting import andrews_curves +import matplotlib +matplotlib.use('Agg') - def date_range(start=None, end=None, periods=None, freq=None): - return DatetimeIndex(start, end, periods=periods, offset=freq) +from .pandas_vb_common import setup # noqa -class plot_timeseries_period(object): +class Plotting(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(np.random.randn(1000000)) + self.df = DataFrame({'col': self.s}) + + def time_series_plot(self): + self.s.plot() + + def time_frame_plot(self): + self.df.plot() + + +class TimeseriesPlotting(object): + goal_time = 0.2 def setup(self): - self.N = 2000 - self.M = 5 - self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) + N = 2000 + M = 5 + idx = date_range('1/1/1975', periods=N) + self.df = DataFrame(np.random.randn(N, M), index=idx) + + idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10], + idx.values[12:]))) + self.df2 = DataFrame(np.random.randn(len(idx_irregular), M), + index=idx_irregular) + + def time_plot_regular(self): + self.df.plot() + + def time_plot_regular_compat(self): + self.df.plot(x_compat=True) + + def time_plot_irregular(self): + self.df2.plot() + + +class Misc(object): + + goal_time = 0.6 + + def setup(self): + N = 500 + M = 10 + self.df = DataFrame(np.random.randn(N, M)) + self.df['Name'] = ["A"] * N - def time_plot_timeseries_period(self): - self.df.plot() \ No newline at end of file + def time_plot_andrews_curves(self): + andrews_curves(self.df, "Name") diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index b1c039058ff8f..413427a16f40b 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,397 +1,172 @@ -from .pandas_vb_common import * -from random import shuffle +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index, + date_range) +from .pandas_vb_common import setup, lib # noqa -class dataframe_reindex(object): - goal_time = 0.2 - - def setup(self): - self.rng 
= DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute()) - self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, columns=range(10)) - self.df['foo'] = 'bar' - self.rng2 = Index(self.rng[::2]) - - def time_dataframe_reindex(self): - self.df.reindex(self.rng2) - - -class frame_drop_dup_inplace(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_drop_dup_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) - - -class frame_drop_dup_na_inplace(object): - goal_time = 0.2 +class Reindex(object): - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_frame_drop_dup_na_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) - - -class frame_drop_duplicates(object): goal_time = 0.2 def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_drop_duplicates(self): - self.df.drop_duplicates(['key1', 'key2']) - - -class frame_drop_duplicates_int(object): - - def setup(self): - np.random.seed(1234) - self.N = 1000000 - self.K = 10000 - self.key1 = np.random.randint(0,self.K,size=self.N) - self.df = DataFrame({'key1': self.key1}) - - def time_frame_drop_duplicates_int(self): - self.df.drop_duplicates() - - -class frame_drop_duplicates_na(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_frame_drop_duplicates_na(self): - self.df.drop_duplicates(['key1', 'key2']) - - -class frame_fillna_many_columns_pad(object): - goal_time = 0.2 - - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') - - -class frame_reindex_columns(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30)) - - def time_frame_reindex_columns(self): - self.df.reindex(columns=self.df.columns[1:5]) - - -class frame_sort_index_by_columns(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = 
list(self.df.values.T) - - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) - - -class lib_fast_zip(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_lib_fast_zip(self): - lib.fast_zip(self.col_array_list) - - -class lib_fast_zip_fillna(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_lib_fast_zip_fillna(self): - lib.fast_zip_fillna(self.col_array_list) - - -class reindex_daterange_backfill(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_daterange_backfill(self): - self.backfill(self.ts2, self.ts.index) - - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - -class reindex_daterange_pad(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_daterange_pad(self): - self.pad(self.ts2, self.ts.index) - - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - -class reindex_fillna_backfill(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_fillna_backfill(self): - self.ts3.fillna(method='backfill') + rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') + self.df = DataFrame(np.random.rand(10000, 10), index=rng, + columns=range(10)) + self.df['foo'] = 'bar' + self.rng_subset = Index(rng[::2]) + self.df2 = DataFrame(index=range(10000), + data=np.random.rand(10000, 30), columns=range(30)) + N = 5000 + K = 200 + level1 = tm.makeStringIndex(N).values.repeat(K) + level2 = np.tile(tm.makeStringIndex(K).values, N) + index = MultiIndex.from_arrays([level1, level2]) + self.s = 
Series(np.random.randn(N * K), index=index) + self.s_subset = self.s[::2] + + def time_reindex_dates(self): + self.df.reindex(self.rng_subset) + + def time_reindex_columns(self): + self.df2.reindex(columns=self.df.columns[1:5]) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def time_reindex_multiindex(self): + self.s.reindex(self.s_subset.index) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +class ReindexMethod(object): -class reindex_fillna_backfill_float32(object): goal_time = 0.2 + params = ['pad', 'backfill'] + param_names = ['method'] - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_fillna_backfill_float32(self): - self.ts4.fillna(method='backfill') + def setup(self, method): + N = 100000 + self.idx = date_range('1/1/2000', periods=N, freq='1min') + self.ts = Series(np.random.randn(N), index=self.idx)[::2] - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def time_reindex_method(self, method): + self.ts.reindex(self.idx, method=method) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +class Fillna(object): -class reindex_fillna_pad(object): goal_time = 0.2 + params = ['pad', 'backfill'] + param_names = ['method'] - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + def setup(self, method): + N = 100000 + self.idx = date_range('1/1/2000', periods=N, freq='1min') + ts = Series(np.random.randn(N), index=self.idx)[::2] + self.ts_reindexed = ts.reindex(self.idx) + self.ts_float32 = self.ts_reindexed.astype('float32') - def time_reindex_fillna_pad(self): - self.ts3.fillna(method='pad') + def time_reindexed(self, method): + self.ts_reindexed.fillna(method=method) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def time_float_32(self, method): + self.ts_float32.fillna(method=method) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +class LevelAlign(object): -class reindex_fillna_pad_float32(object): goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + self.index = MultiIndex( + levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + 
np.tile(np.tile(np.arange(100), 100), 10)]) + self.df = DataFrame(np.random.randn(len(self.index), 4), + index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), + index=self.index.levels[1]) - def time_reindex_fillna_pad_float32(self): - self.ts4.fillna(method='pad') + def time_align_level(self): + self.df.align(self.df_level, level=1, copy=False) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def time_reindex_level(self): + self.df_level.reindex(self.index, level=1) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +class DropDuplicates(object): -class reindex_frame_level_align(object): goal_time = 0.2 + params = [True, False] + param_names = ['inplace'] - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def setup(self, inplace): + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + self.df = DataFrame({'key1': key1, 'key2': key2, + 'value': np.random.randn(N * K)}) + self.df_nan = self.df.copy() + self.df_nan.iloc[:10000, :] = np.nan - def time_reindex_frame_level_align(self): - self.df.align(self.df_level, level=1, copy=False) + self.s = Series(np.random.randint(0, 1000, size=10000)) + self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + N = 1000000 + K = 10000 + key1 = np.random.randint(0, K, size=N) + self.df_int = DataFrame({'key1': key1}) + self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), + dtype=bool)) -class reindex_frame_level_reindex(object): - goal_time = 0.2 + def time_frame_drop_dups(self, inplace): + self.df.drop_duplicates(['key1', 'key2'], inplace=inplace) - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_frame_drop_dups_na(self, inplace): + self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace) - def time_reindex_frame_level_reindex(self): - self.df_level.reindex(self.df.index, level=1) + def time_series_drop_dups_int(self, inplace): + self.s.drop_duplicates(inplace=inplace) + def time_series_drop_dups_string(self, inplace): + self.s_str.drop_duplicates(inplace=inplace) -class reindex_multiindex(object): - goal_time = 0.2 + def time_frame_drop_dups_int(self, inplace): + self.df_int.drop_duplicates(inplace=inplace) - def setup(self): - self.N = 1000 - self.K = 20 - self.level1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.level2 = np.tile(tm.makeStringIndex(self.K).values, self.N) - self.index = MultiIndex.from_arrays([self.level1, self.level2]) - self.s1 = Series(np.random.randn((self.N * self.K)), index=self.index) - self.s2 = self.s1[::2] - - def 
time_reindex_multiindex(self): - self.s1.reindex(self.s2.index) + def time_frame_drop_dups_bool(self, inplace): + self.df_bool.drop_duplicates(inplace=inplace) -class series_align_irregular_string(object): +class Align(object): + # blog "pandas escaped the zoo" goal_time = 0.2 def setup(self): - self.n = 50000 - self.indices = tm.makeStringIndex(self.n) - self.subsample_size = 40000 - self.x = Series(np.random.randn(50000), self.indices) - self.y = Series(np.random.randn(self.subsample_size), index=self.sample(self.indices, self.subsample_size)) + n = 50000 + indices = tm.makeStringIndex(n) + subsample_size = 40000 + self.x = Series(np.random.randn(n), indices) + self.y = Series(np.random.randn(subsample_size), + index=np.random.choice(indices, subsample_size, + replace=False)) - def time_series_align_irregular_string(self): - (self.x + self.y) + def time_align_series_irregular_string(self): + self.x + self.y - def sample(self, values, k): - self.sampler = np.arange(len(values)) - shuffle(self.sampler) - return values.take(self.sampler[:k]) +class LibFastZip(object): -class series_drop_duplicates_int(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + col_array = np.vstack([key1, key2, np.random.randn(N * K)]) + col_array2 = col_array.copy() + col_array2[:, :10000] = np.nan + self.col_array_list = list(col_array) - def time_series_drop_duplicates_int(self): - self.s.drop_duplicates() - - -class series_drop_duplicates_string(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - - def time_series_drop_duplicates_string(self): - self.s2.drop_duplicates() + def time_lib_fast_zip(self): + lib.fast_zip(self.col_array_list) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index e9f33ebfce0bd..41208125e8f32 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,48 +1,58 @@ -from .pandas_vb_common import * -from pandas.compat import range -from datetime import timedelta +import numpy as np +import pandas as pd +from .pandas_vb_common import setup # noqa + + +class FillNa(object): -class replace_fillna(object): goal_time = 0.2 + params = [True, False] + param_names = ['inplace'] + + def setup(self, inplace): + N = 10**6 + rng = pd.date_range('1/1/2000', periods=N, freq='min') + data = np.random.randn(N) + data[::2] = np.nan + self.ts = pd.Series(data, index=rng) + + def time_fillna(self, inplace): + self.ts.fillna(0.0, inplace=inplace) - def setup(self): - self.N = 1000000 - try: - self.rng = date_range('1/1/2000', periods=self.N, freq='min') - except NameError: - self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute()) - self.date_range = DateRange - self.ts = Series(np.random.randn(self.N), index=self.rng) + def time_replace(self, inplace): + self.ts.replace(np.nan, 0.0, inplace=inplace) - def time_replace_fillna(self): - self.ts.fillna(0.0, inplace=True) +class ReplaceDict(object): -class replace_large_dict(object): goal_time = 0.2 + params = [True, False] + param_names = ['inplace'] - def setup(self): - self.n = (10 ** 6) - self.start_value = (10 ** 5) - self.to_rep = dict(((i, (self.start_value + i)) for i in range(self.n))) - self.s = 
Series(np.random.randint(self.n, size=(10 ** 3))) + def setup(self, inplace): + N = 10**5 + start_value = 10**5 + self.to_rep = dict(enumerate(np.arange(N) + start_value)) + self.s = pd.Series(np.random.randint(N, size=10**3)) - def time_replace_large_dict(self): - self.s.replace(self.to_rep, inplace=True) + def time_replace_series(self, inplace): + self.s.replace(self.to_rep, inplace=inplace) -class replace_replacena(object): - goal_time = 0.2 +class Convert(object): + + goal_time = 0.5 + params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) + param_names = ['constructor', 'replace_data'] + + def setup(self, constructor, replace_data): + N = 10**3 + data = {'Series': pd.Series(np.random.randint(N, size=N)), + 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N), + 'B': np.random.randint(N, size=N)})} + self.to_replace = {i: getattr(pd, replace_data) for i in range(N)} + self.data = data[constructor] - def setup(self): - self.N = 1000000 - try: - self.rng = date_range('1/1/2000', periods=self.N, freq='min') - except NameError: - self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute()) - self.date_range = DateRange - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_replace_replacena(self): - self.ts.replace(np.nan, 0.0, inplace=True) \ No newline at end of file + def time_replace(self, constructor, replace_data): + self.data.replace(self.to_replace) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 604fa5092a231..9044b080c45f9 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,13 +1,16 @@ -from .pandas_vb_common import * -from pandas.core.reshape import melt +from itertools import product +import numpy as np +from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long + +from .pandas_vb_common import setup # noqa + + +class Melt(object): -class melt_dataframe(object): goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C']) self.df['id1'] = np.random.randint(0, 10, 10000) self.df['id2'] = np.random.randint(100, 1000, 10000) @@ -16,61 +19,116 @@ def time_melt_dataframe(self): melt(self.df, id_vars=['id1', 'id2']) -class reshape_pivot_time_series(object): +class Pivot(object): + goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) - self.index = date_range('1/1/2000', periods=10000, freq='h') - self.df = DataFrame(randn(10000, 50), index=self.index, columns=range(50)) - self.pdf = self.unpivot(self.df) - self.f = (lambda : self.pdf.pivot('date', 'variable', 'value')) + N = 10000 + index = date_range('1/1/2000', periods=N, freq='h') + data = {'value': np.random.randn(N * 50), + 'variable': np.arange(50).repeat(N), + 'date': np.tile(index.values, 50)} + self.df = DataFrame(data) def time_reshape_pivot_time_series(self): - self.f() + self.df.pivot('date', 'variable', 'value') - def unpivot(self, frame): - (N, K) = frame.shape - self.data = {'value': frame.values.ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K), } - return DataFrame(self.data, columns=['date', 'variable', 'value']) +class SimpleReshape(object): -class 
reshape_stack_simple(object): goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) + arrays = [np.arange(100).repeat(100), + np.roll(np.tile(np.arange(100), 100), 25)] + index = MultiIndex.from_arrays(arrays) + self.df = DataFrame(np.random.randn(10000, 4), index=index) self.udf = self.df.unstack(1) - def time_reshape_stack_simple(self): + def time_stack(self): self.udf.stack() + def time_unstack(self): + self.df.unstack(1) + + +class Unstack(object): -class reshape_unstack_simple(object): goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) + m = 100 + n = 1000 + + levels = np.arange(m) + index = MultiIndex.from_product([levels] * 2) + columns = np.arange(n) + values = np.arange(m * m * n).reshape(m * m, n) + self.df = DataFrame(values, index, columns) + self.df2 = self.df.iloc[:-1] + + def time_full_product(self): + self.df.unstack() + + def time_without_last_row(self): + self.df2.unstack() + + +class SparseIndex(object): + + goal_time = 0.2 + + def setup(self): + NUM_ROWS = 1000 + self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS), + 'B': np.random.randint(50, size=NUM_ROWS), + 'C': np.random.randint(-10, 10, size=NUM_ROWS), + 'D': np.random.randint(-10, 10, size=NUM_ROWS), + 'E': np.random.randint(10, size=NUM_ROWS), + 'F': np.random.randn(NUM_ROWS)}) + self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E']) + + def time_unstack(self): + self.df.unstack() + + +class WideToLong(object): + + goal_time = 0.2 + + def setup(self): + nyrs = 20 + nidvars = 20 + N = 5000 + self.letters = list('ABCD') + yrvars = [l + str(num) + for l, num in product(self.letters, range(1, nyrs + 1))] + columns = [str(i) for i in range(nidvars)] + yrvars + self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), + columns=columns) + self.df['id'] = self.df.index + + def time_wide_to_long_big(self): + wide_to_long(self.df, self.letters, i='id', j='year') - def time_reshape_unstack_simple(self): - self.df.unstack(1) +class PivotTable(object): -class unstack_sparse_keyspace(object): goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) - self.NUM_ROWS = 1000 - for iter in range(10): - self.df = DataFrame({'A': np.random.randint(50, size=self.NUM_ROWS), 'B': np.random.randint(50, size=self.NUM_ROWS), 'C': np.random.randint((-10), 10, size=self.NUM_ROWS), 'D': np.random.randint((-10), 10, size=self.NUM_ROWS), 'E': np.random.randint(10, size=self.NUM_ROWS), 'F': np.random.randn(self.NUM_ROWS), }) - self.idf = self.df.set_index(['A', 'B', 'C', 'D', 'E']) - if (len(self.idf.index.unique()) == self.NUM_ROWS): - break - - def time_unstack_sparse_keyspace(self): - self.idf.unstack() \ No newline at end of file + N = 100000 + fac1 = np.array(['A', 'B', 'C'], dtype='O') + fac2 = np.array(['one', 'two'], dtype='O') + ind1 = np.random.randint(0, 3, size=N) + ind2 = np.random.randint(0, 2, size=N) + self.df = DataFrame({'key1': fac1.take(ind1), + 'key2': fac2.take(ind2), + 'key3': fac2.take(ind2), + 'value1': np.random.randn(N), + 'value2': np.random.randn(N), + 'value3': np.random.randn(N)}) + + def time_pivot_table(self): + 
self.df.pivot_table(index='key1', columns=['key2', 'key3']) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py new file mode 100644 index 0000000000000..e3bf551fa5f2b --- /dev/null +++ b/asv_bench/benchmarks/rolling.py @@ -0,0 +1,79 @@ +import pandas as pd +import numpy as np + +from .pandas_vb_common import setup # noqa + + +class Methods(object): + + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['constructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_rolling(self, constructor, window, dtype, method): + getattr(self.roll, method)() + + +class VariableWindowMethods(Methods): + sample_time = 0.2 + params = (['DataFrame', 'Series'], + ['50s', '1h', '1d'], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['constructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + index = pd.date_range('2017-01-01', periods=N, freq='5s') + self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) + + +class Pairwise(object): + + sample_time = 0.2 + params = ([10, 1000, None], + ['corr', 'cov'], + [True, False]) + param_names = ['window', 'method', 'pairwise'] + + def setup(self, window, method, pairwise): + N = 10**4 + arr = np.random.random(N) + self.df = pd.DataFrame(arr) + + def time_pairwise(self, window, method, pairwise): + if window is None: + r = self.df.expanding() + else: + r = self.df.rolling(window=window) + getattr(r, method)(self.df, pairwise=pairwise) + + +class Quantile(object): + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + [0, 0.5, 1], + ['linear', 'nearest', 'lower', 'higher', 'midpoint']) + param_names = ['constructor', 'window', 'dtype', 'percentile', 'interpolation'] + + def setup(self, constructor, window, dtype, percentile, interpolation): + N = 10 ** 5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_quantile(self, constructor, window, dtype, percentile, + interpolation): + self.roll.quantile(percentile, interpolation=interpolation) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 37969a6949157..a5ccf5c32b876 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -1,73 +1,136 @@ -from .pandas_vb_common import * +from datetime import datetime +import numpy as np +import pandas.util.testing as tm +from pandas import Series, date_range, NaT + +from .pandas_vb_common import setup # noqa + + +class SeriesConstructor(object): -class series_isin_int64(object): goal_time = 0.2 + params = [None, 'dict'] + param_names = ['data'] - def setup(self): - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.s4 = Series(np.random.randint(1, 100, 10000000)).astype('int64') - self.values = [1, 2] + def setup(self, data): + self.idx = date_range(start=datetime(2015, 10, 26), + end=datetime(2016, 1, 1), + freq='50s') + dict_data = dict(zip(self.idx, range(len(self.idx)))) + self.data = None if data is None else dict_data - def time_series_isin_int64(self): - self.s3.isin(self.values)
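Both the rolling benchmarks above and the series-method rewrite here rely on asv's parameterization: asv runs setup and each time_* method once per combination drawn from params, passing the chosen values positionally in param_names order, so a single class replaces one hand-written class per case. A minimal sketch of the mechanism under standard asv conventions (the Example class is illustrative, not part of this diff):

import numpy as np
import pandas as pd


class Example(object):
    # asv times every combination: ('DataFrame', 10), ('DataFrame', 1000),
    # ('Series', 10), ('Series', 1000)
    params = (['DataFrame', 'Series'], [10, 1000])
    param_names = ['constructor', 'window']

    def setup(self, constructor, window):
        arr = np.random.random(10**5)
        self.roll = getattr(pd, constructor)(arr).rolling(window)

    def time_mean(self, constructor, window):
        self.roll.mean()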
+ def time_constructor(self, data): + Series(data=self.data, index=self.idx) - def time_series_isin_int64_large(self): - self.s4.isin(self.values) +class IsIn(object): -class series_isin_object(object): goal_time = 0.2 + params = ['int64', 'object'] + param_names = ['dtype'] - def setup(self): - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') + def setup(self, dtype): + self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) self.values = [1, 2] - self.s4 = self.s3.astype('object') - def time_series_isin_object(self): - self.s4.isin(self.values) + def time_isin(self, dtype): + self.s.isin(self.values) + + +class NSort(object): + + goal_time = 0.2 + params = ['first', 'last', 'all'] + param_names = ['keep'] + + def setup(self, keep): + self.s = Series(np.random.randint(1, 10, 100000)) + + def time_nlargest(self, keep): + self.s.nlargest(3, keep=keep) + + def time_nsmallest(self, keep): + self.s.nsmallest(3, keep=keep) + + +class Dropna(object): + + goal_time = 0.2 + params = ['int', 'datetime'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**6 + data = {'int': np.random.randint(1, 10, N), + 'datetime': date_range('2000-01-01', freq='S', periods=N)} + self.s = Series(data[dtype]) + if dtype == 'datetime': + self.s[np.random.randint(1, N, 100)] = NaT + def time_dropna(self, dtype): + self.s.dropna() + + +class Map(object): + + goal_time = 0.2 + params = ['dict', 'Series'] + param_names = ['mapper'] + + def setup(self, mapper): + map_size = 1000 + map_data = Series(map_size - np.arange(map_size)) + self.map_data = map_data if mapper == 'Series' else map_data.to_dict() + self.s = Series(np.random.randint(0, map_size, 10000)) + + def time_map(self, mapper): + self.s.map(self.map_data) + + +class Clip(object): -class series_nlargest1(object): goal_time = 0.2 def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = self.s3.astype('object') + self.s = Series(np.random.randn(50)) - def time_series_nlargest1(self): - self.s1.nlargest(3, take_last=True) - self.s1.nlargest(3, take_last=False) + def time_clip(self): + self.s.clip(0, 1) -class series_nlargest2(object): +class ValueCounts(object): + + goal_time = 0.2 + params = ['int', 'float', 'object'] + param_names = ['dtype'] + + def setup(self, dtype): + self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype) + + def time_value_counts(self, dtype): + self.s.value_counts() + + +class Dir(object): + goal_time = 0.2 def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = self.s3.astype('object') + self.s = Series(index=tm.makeStringIndex(10000)) - def time_series_nlargest2(self): - self.s2.nlargest(3, take_last=True) - self.s2.nlargest(3, take_last=False) + def time_dir_strings(self): + dir(self.s) -class series_nsmallest2(object): +class SeriesGetattr(object): + # https://github.com/pandas-dev/pandas/issues/19764 goal_time = 0.2 def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = self.s3.astype('object') + self.s = Series(1, + index=date_range("2012-01-01", freq='s', + periods=int(1e6))) - def
time_series_nsmallest2(self): - self.s2.nsmallest(3, take_last=True) - self.s2.nsmallest(3, take_last=False) + def time_series_datetimeindex_repr(self): + getattr(self.s, 'a', None) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index d7ee58fc978ea..dcb7694abc2ad 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -1,55 +1,162 @@ -from .pandas_vb_common import * -import pandas.sparse.series +import itertools + +import numpy as np import scipy.sparse -from pandas.core.sparse import SparseSeries, SparseDataFrame -from pandas.core.sparse import SparseDataFrame +from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series, + date_range, MultiIndex) + +from .pandas_vb_common import setup # noqa + + +def make_array(size, dense_proportion, fill_value, dtype): + dense_size = int(size * dense_proportion) + arr = np.full(size, fill_value, dtype) + indexer = np.random.choice(np.arange(size), dense_size, replace=False) + arr[indexer] = np.random.choice(np.arange(100, dtype=dtype), dense_size) + return arr + +class SparseSeriesToFrame(object): -class sparse_series_to_frame(object): goal_time = 0.2 def setup(self): - self.K = 50 - self.N = 50000 - self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T')) + K = 50 + N = 50001 + rng = date_range('1/1/2000', periods=N, freq='T') self.series = {} - for i in range(1, (self.K + 1)): - self.data = np.random.randn(self.N)[:(- i)] - self.this_rng = self.rng[:(- i)] - self.data[100:] = np.nan - self.series[i] = SparseSeries(self.data, index=self.this_rng) + for i in range(1, K): + data = np.random.randn(N)[:-i] + idx = rng[:-i] + data[100:] = np.nan + self.series[i] = SparseSeries(data, index=idx) - def time_sparse_series_to_frame(self): + def time_series_to_frame(self): SparseDataFrame(self.series) -class sparse_frame_constructor(object): +class SparseArrayConstructor(object): + goal_time = 0.2 + params = ([0.1, 0.01], [0, np.nan], + [np.int64, np.float64, np.object]) + param_names = ['dense_proportion', 'fill_value', 'dtype'] + + def setup(self, dense_proportion, fill_value, dtype): + N = 10**6 + self.array = make_array(N, dense_proportion, fill_value, dtype) + + def time_sparse_array(self, dense_proportion, fill_value, dtype): + SparseArray(self.array, fill_value=fill_value, dtype=dtype) - def time_sparse_frame_constructor(self): - SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) +class SparseDataFrameConstructor(object): -class sparse_series_from_coo(object): goal_time = 0.2 def setup(self): - self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) + N = 1000 + self.arr = np.arange(N) + self.sparse = scipy.sparse.rand(N, N, 0.005) + self.dict = dict(zip(range(N), itertools.repeat([0]))) + + def time_constructor(self): + SparseDataFrame(columns=self.arr, index=self.arr) + + def time_from_scipy(self): + SparseDataFrame(self.sparse) + + def time_from_dict(self): + SparseDataFrame(self.dict) + + +class FromCoo(object): + + goal_time = 0.2 + + def setup(self): + self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], + ([1, 0, 0], [0, 2, 3])), + shape=(100, 100)) def time_sparse_series_from_coo(self): - self.ss = pandas.sparse.series.SparseSeries.from_coo(self.A) + SparseSeries.from_coo(self.matrix) -class sparse_series_to_coo(object): +class ToCoo(object): + goal_time = 0.2 def setup(self): - self.s = pd.Series(([np.nan] * 10000)) - self.s[0] = 3.0 - self.s[100] = (-1.0) - self.s[999] = 12.1 - self.s.index = 
pd.MultiIndex.from_product((range(10), range(10), range(10), range(10))) - self.ss = self.s.to_sparse() + s = Series([np.nan] * 10000) + s[0] = 3.0 + s[100] = -1.0 + s[999] = 12.1 + s.index = MultiIndex.from_product([range(10)] * 4) + self.ss = s.to_sparse() def time_sparse_series_to_coo(self): - self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) \ No newline at end of file + self.ss.to_coo(row_levels=[0, 1], + column_levels=[2, 3], + sort_labels=True) + + +class Arithmetic(object): + + goal_time = 0.2 + params = ([0.1, 0.01], [0, np.nan]) + param_names = ['dense_proportion', 'fill_value'] + + def setup(self, dense_proportion, fill_value): + N = 10**6 + arr1 = make_array(N, dense_proportion, fill_value, np.int64) + self.array1 = SparseArray(arr1, fill_value=fill_value) + arr2 = make_array(N, dense_proportion, fill_value, np.int64) + self.array2 = SparseArray(arr2, fill_value=fill_value) + + def time_make_union(self, dense_proportion, fill_value): + self.array1.sp_index.make_union(self.array2.sp_index) + + def time_intersect(self, dense_proportion, fill_value): + self.array1.sp_index.intersect(self.array2.sp_index) + + def time_add(self, dense_proportion, fill_value): + self.array1 + self.array2 + + def time_divide(self, dense_proportion, fill_value): + self.array1 / self.array2 + + +class ArithmeticBlock(object): + + goal_time = 0.2 + params = [np.nan, 0] + param_names = ['fill_value'] + + def setup(self, fill_value): + N = 10**6 + self.arr1 = self.make_block_array(length=N, num_blocks=1000, + block_size=10, fill_value=fill_value) + self.arr2 = self.make_block_array(length=N, num_blocks=1000, + block_size=10, fill_value=fill_value) + + def make_block_array(self, length, num_blocks, block_size, fill_value): + arr = np.full(length, fill_value) + indices = np.random.choice(np.arange(0, length, block_size), + num_blocks, + replace=False) + for ind in indices: + arr[ind:ind + block_size] = np.random.randint(0, 100, block_size) + return SparseArray(arr, fill_value=fill_value) + + def time_make_union(self, fill_value): + self.arr1.sp_index.make_union(self.arr2.sp_index) + + def time_intersect(self, fill_value): + self.arr1.sp_index.intersect(self.arr2.sp_index) + + def time_addition(self, fill_value): + self.arr1 + self.arr2 + + def time_division(self, fill_value): + self.arr1 / self.arr2 diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 4125357455d2e..c447c78d0d070 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,236 +1,114 @@ -from .pandas_vb_common import * +import numpy as np +import pandas as pd +from .pandas_vb_common import setup # noqa -class stat_ops_frame_mean_float_axis_0(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) +ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem', + 'var'] - def time_stat_ops_frame_mean_float_axis_0(self): - self.df.mean() +class FrameOps(object): -class stat_ops_frame_mean_float_axis_1(object): goal_time = 0.2 + params = [ops, ['float', 'int'], [0, 1], [True, False]] + param_names = ['op', 'dtype', 'axis', 'use_bottleneck'] - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) - - def time_stat_ops_frame_mean_float_axis_1(self): - self.df.mean(1) - - -class stat_ops_frame_mean_int_axis_0(object): - goal_time = 0.2
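FrameOps and SeriesOps below time each reduction with bottleneck both enabled and disabled. The use_bottleneck parameter is applied through the pandas option, falling back to the private switch in pandas.core.nanops for older pandas versions that predate the option. Roughly, as a sketch of the toggle mirrored by the setup code that follows (assuming, as in pandas.core.config, that OptionError subclasses AttributeError):

import pandas as pd

try:
    # modern pandas exposes the switch as a regular option
    pd.options.compute.use_bottleneck = False
except AttributeError:
    # older pandas: flip the module-level flag directly
    from pandas.core import nanops
    nanops._USE_BOTTLENECK = False

+ def setup(self, op, 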
dtype, axis, use_bottleneck): + df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.df_func = getattr(df, op) - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + def time_op(self, op, dtype, axis, use_bottleneck): + self.df_func(axis=axis) - def time_stat_ops_frame_mean_int_axis_0(self): - self.dfi.mean() +class FrameMultiIndexOps(object): -class stat_ops_frame_mean_int_axis_1(object): goal_time = 0.2 + params = ([0, 1, [0, 1]], ops) + param_names = ['level', 'op'] - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) - - def time_stat_ops_frame_mean_int_axis_1(self): - self.dfi.mean(1) - - -class stat_ops_frame_sum_float_axis_0(object): - goal_time = 0.2 + def setup(self, level, op): + levels = [np.arange(10), np.arange(100), np.arange(100)] + labels = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, labels=labels) + df = pd.DataFrame(np.random.randn(len(index), 4), index=index) + self.df_func = getattr(df, op) - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + def time_op(self, level, op): + self.df_func(level=level) - def time_stat_ops_frame_sum_float_axis_0(self): - self.df.sum() +class SeriesOps(object): -class stat_ops_frame_sum_float_axis_1(object): goal_time = 0.2 + params = [ops, ['float', 'int'], [True, False]] + param_names = ['op', 'dtype', 'use_bottleneck'] - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + def setup(self, op, dtype, use_bottleneck): + s = pd.Series(np.random.randn(100000)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.s_func = getattr(s, op) - def time_stat_ops_frame_sum_float_axis_1(self): - self.df.sum(1) + def time_op(self, op, dtype, use_bottleneck): + self.s_func() -class stat_ops_frame_sum_int_axis_0(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) - - def time_stat_ops_frame_sum_int_axis_0(self): - self.dfi.sum() - +class SeriesMultiIndexOps(object): -class stat_ops_frame_sum_int_axis_1(object): goal_time = 0.2 + params = ([0, 1, [0, 1]], ops) + param_names = ['level', 'op'] - def setup(self): - self.df = DataFrame(np.random.randn(100000, 4)) - self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) - - def time_stat_ops_frame_sum_int_axis_1(self): - self.dfi.sum(1) - - -class stat_ops_level_frame_sum(object): - goal_time = 0.2 + def setup(self, level, op): + levels = [np.arange(10), np.arange(100), np.arange(100)] + labels = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, labels=labels) + s = pd.Series(np.random.randn(len(index)), index=index) + self.s_func = getattr(s, op) - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], 
labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_op(self, level, op): + self.s_func(level=level) - def time_stat_ops_level_frame_sum(self): - self.df.sum(level=1) +class Rank(object): -class stat_ops_level_frame_sum_multiple(object): goal_time = 0.2 + params = [['DataFrame', 'Series'], [True, False]] + param_names = ['constructor', 'pct'] - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_frame_sum_multiple(self): - self.df.sum(level=[0, 1]) - - -class stat_ops_level_series_sum(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_series_sum(self): - self.df[1].sum(level=1) - - -class stat_ops_level_series_sum_multiple(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_series_sum_multiple(self): - self.df[1].sum(level=[0, 1]) - - -class stat_ops_series_std(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.randn(100000), index=np.arange(100000)) - self.s[::2] = np.nan - - def time_stat_ops_series_std(self): - self.s.std() + def setup(self, constructor, pct): + values = np.random.randn(10**5) + self.data = getattr(pd, constructor)(values) + def time_rank(self, constructor, pct): + self.data.rank(pct=pct) -class stats_corr_spearman(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 30)) - - def time_stats_corr_spearman(self): - self.df.corr(method='spearman') - - -class stats_rank2d_axis0_average(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(5000, 50)) - - def time_stats_rank2d_axis0_average(self): - self.df.rank() - - -class stats_rank2d_axis1_average(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(5000, 50)) - - def time_stats_rank2d_axis1_average(self): - self.df.rank(1) - - -class stats_rank_average(object): - goal_time = 0.2 - - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) - - def time_stats_rank_average(self): - self.s.rank() - - -class 
stats_rank_average_int(object): - goal_time = 0.2 - - def setup(self): - self.values = np.random.randint(0, 100000, size=200000) - self.s = Series(self.values) - - def time_stats_rank_average_int(self): - self.s.rank() - - -class stats_rank_pct_average(object): - goal_time = 0.2 - - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) - - def time_stats_rank_pct_average(self): - self.s.rank(pct=True) - - -class stats_rank_pct_average_old(object): - goal_time = 0.2 - - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) + def time_average_old(self, constructor, pct): + self.data.rank(pct=pct) / len(self.data) - def time_stats_rank_pct_average_old(self): - (self.s.rank() / len(self.s)) +class Correlation(object): -class stats_rolling_mean(object): goal_time = 0.2 + params = ['spearman', 'kendall', 'pearson'] + param_names = ['method'] - def setup(self): - self.arr = np.random.randn(100000) + def setup(self, method): + self.df = pd.DataFrame(np.random.randn(1000, 30)) - def time_stats_rolling_mean(self): - rolling_mean(self.arr, 100) \ No newline at end of file + def time_corr(self, method): + self.df.corr(method=method) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index e4f91b1b9c0c6..b203c8b0fa5c9 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,393 +1,147 @@ -from .pandas_vb_common import * -import string -import itertools as IT -import pandas.util.testing as testing +import warnings +import numpy as np +from pandas import Series +import pandas.util.testing as tm -class strings_cat(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_cat(self): - self.many.str.cat(sep=',') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Methods(object): -class strings_center(object): goal_time = 0.2 def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + self.s = Series(tm.makeStringIndex(10**5)) - def time_strings_center(self): - self.many.str.center(100) + def time_cat(self): + self.s.str.cat(sep=',') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_center(self): + self.s.str.center(100) + def time_count(self): + self.s.str.count('A') -class strings_contains_few(object): - goal_time = 0.2 + def time_endswith(self): + self.s.str.endswith('A') - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_extract(self): + with warnings.catch_warnings(record=True): + self.s.str.extract('(\\w*)A(\\w*)') - def time_strings_contains_few(self): - self.few.str.contains('matchthis') + def time_findall(self): + self.s.str.findall('[A-Z]+') - def make_series(self, 
letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_get(self): + self.s.str.get(0) + def time_len(self): + self.s.str.len() -class strings_contains_few_noregex(object): - goal_time = 0.2 + def time_match(self): + self.s.str.match('A') - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_pad(self): + self.s.str.pad(100, side='both') - def time_strings_contains_few_noregex(self): - self.few.str.contains('matchthis', regex=False) + def time_replace(self): + self.s.str.replace('A', '\x01\x01') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_slice(self): + self.s.str.slice(5, 15, 2) + def time_startswith(self): + self.s.str.startswith('A') -class strings_contains_many(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_many(self): - self.many.str.contains('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_many_noregex(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_many_noregex(self): - self.many.str.contains('matchthis', regex=False) - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_count(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_count(self): - self.many.str.count('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_encode_decode(object): - goal_time = 0.2 - - def setup(self): - self.ser = Series(testing.makeUnicodeIndex()) + def time_strip(self): + self.s.str.strip('A') - def time_strings_encode_decode(self): - self.ser.str.encode('utf-8').str.decode('utf-8') + def time_rstrip(self): + self.s.str.rstrip('A') + def time_lstrip(self): + self.s.str.lstrip('A') -class strings_endswith(object): - goal_time = 0.2 + def time_title(self): + self.s.str.title() - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_upper(self): + self.s.str.upper() - def time_strings_endswith(self): - self.many.str.endswith('matchthis') + def time_lower(self): + self.s.str.lower() - def make_series(self, letters, 
strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Repeat(object): -class strings_extract(object): goal_time = 0.2 + params = ['int', 'array'] + param_names = ['repeats'] - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def setup(self, repeats): + N = 10**5 + self.s = Series(tm.makeStringIndex(N)) + repeat = {'int': 1, 'array': np.random.randint(1, 3, N)} + self.repeat = repeat[repeats] - def time_strings_extract(self): - self.many.str.extract('(\\w*)matchthis(\\w*)') + def time_repeat(self, repeats): + self.s.str.repeat(self.repeat) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Contains(object): -class strings_findall(object): goal_time = 0.2 + params = [True, False] + param_names = ['regex'] - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def setup(self, regex): + self.s = Series(tm.makeStringIndex(10**5)) - def time_strings_findall(self): - self.many.str.findall('[A-Z]+') + def time_contains(self, regex): + self.s.str.contains('A', regex=regex) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Split(object): -class strings_get(object): goal_time = 0.2 + params = [True, False] + param_names = ['expand'] - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def setup(self, expand): + self.s = Series(tm.makeStringIndex(10**5)).str.join('--') - def time_strings_get(self): - self.many.str.get(0) + def time_split(self, expand): + self.s.str.split('--', expand=expand) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Dummies(object): -class strings_get_dummies(object): goal_time = 0.2 def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - self.s = self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') + self.s = Series(tm.makeStringIndex(10**5)).str.join('|') - def time_strings_get_dummies(self): + def time_get_dummies(self): self.s.str.get_dummies('|') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_join_split(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_join_split(self): - self.many.str.join('--').str.split('--') - - def make_series(self, letters, strlen, size): - 
return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_join_split_expand(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_join_split_expand(self): - self.many.str.join('--').str.split('--', expand=True) - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_len(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_len(self): - self.many.str.len() - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_lower(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_lower(self): - self.many.str.lower() - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Encode(object): - -class strings_lstrip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_lstrip(self): - self.many.str.lstrip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_match(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_match(self): - self.many.str.match('mat..this') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_pad(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_pad(self): - self.many.str.pad(100, side='both') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_repeat(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def 
time_strings_repeat(self): - self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many)))) - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_replace(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_replace(self): - self.many.str.replace('(matchthis)', '\x01\x01') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_rstrip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_rstrip(self): - self.many.str.rstrip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_slice(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_slice(self): - self.many.str.slice(5, 15, 2) - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_startswith(object): goal_time = 0.2 def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + self.ser = Series(tm.makeUnicodeIndex()) - def time_strings_startswith(self): - self.many.str.startswith('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_strip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_strip(self): - self.many.str.strip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_title(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_title(self): - self.many.str.title() + def time_encode_decode(self): + self.ser.str.encode('utf-8').str.decode('utf-8') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Slice(object): -class 
strings_upper(object): goal_time = 0.2 def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_upper(self): - self.many.str.upper() + self.s = Series(['abcdefg', np.nan] * 500000) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) \ No newline at end of file + def time_vector_slice(self): + # GH 2602 + self.s.str[:5] diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 2f252a4d3e1dc..3fe75b3c34299 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,34 +1,129 @@ -from .pandas_vb_common import * -from pandas import to_timedelta +import datetime +import numpy as np +from pandas import Series, timedelta_range, to_timedelta, Timestamp, Timedelta + + +class TimedeltaConstructor(object): -class timedelta_convert_int(object): goal_time = 0.2 - def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) + def time_from_int(self): + Timedelta(123456789) + + def time_from_unit(self): + Timedelta(1, unit='d') + + def time_from_components(self): + Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5, + microseconds=6, nanoseconds=7) + + def time_from_datetime_timedelta(self): + Timedelta(datetime.timedelta(days=1, seconds=1)) + + def time_from_np_timedelta(self): + Timedelta(np.timedelta64(1, 'ms')) + + def time_from_string(self): + Timedelta('1 days') + + def time_from_iso_format(self): + Timedelta('P4DT12H30M5S') + + def time_from_missing(self): + Timedelta('nat') - def time_timedelta_convert_int(self): - to_timedelta(self.arr, unit='s') +class ToTimedelta(object): -class timedelta_convert_string(object): goal_time = 0.2 def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) - self.arr = ['{0} days'.format(i) for i in self.arr] + self.ints = np.random.randint(0, 60, size=10000) + self.str_days = [] + self.str_seconds = [] + for i in self.ints: + self.str_days.append('{0} days'.format(i)) + self.str_seconds.append('00:00:{0:02d}'.format(i)) + + def time_convert_int(self): + to_timedelta(self.ints, unit='s') + + def time_convert_string_days(self): + to_timedelta(self.str_days) + + def time_convert_string_seconds(self): + to_timedelta(self.str_seconds) + + +class ToTimedeltaErrors(object): + + goal_time = 0.2 + params = ['coerce', 'ignore'] + param_names = ['errors'] + + def setup(self, errors): + ints = np.random.randint(0, 60, size=10000) + self.arr = ['{0} days'.format(i) for i in ints] + self.arr[-1] = 'apple' + + def time_convert(self, errors): + to_timedelta(self.arr, errors=errors) - def time_timedelta_convert_string(self): - to_timedelta(self.arr) +class TimedeltaOps(object): -class timedelta_convert_string_seconds(object): goal_time = 0.2 def setup(self): - self.arr = np.random.randint(0, 60, size=10000) - self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] + self.td = to_timedelta(np.arange(1000000)) + self.ts = Timestamp('2000') + + def time_add_td_ts(self): + self.td + self.ts + + +class TimedeltaProperties(object): + + goal_time = 0.2 + + def setup_cache(self): + td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35) + return td + + def time_timedelta_days(self, td): + td.days + + def time_timedelta_seconds(self, td): + td.seconds + + def 
time_timedelta_microseconds(self, td): + td.microseconds + + def time_timedelta_nanoseconds(self, td): + td.nanoseconds + + +class DatetimeAccessor(object): + + goal_time = 0.2 + + def setup_cache(self): + N = 100000 + series = Series(timedelta_range('1 days', periods=N, freq='h')) + return series + + def time_dt_accessor(self, series): + series.dt + + def time_timedelta_days(self, series): + series.dt.days + + def time_timedelta_seconds(self, series): + series.dt.seconds + + def time_timedelta_microseconds(self, series): + series.dt.microseconds - def time_timedelta_convert_string_seconds(self): - to_timedelta(self.arr) \ No newline at end of file + def time_timedelta_nanoseconds(self, series): + series.dt.nanoseconds diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index db0c526f25c7b..eada401d2930b 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1,1163 +1,400 @@ -from pandas.tseries.converter import DatetimeConverter -from .pandas_vb_common import * -import pandas as pd +import warnings from datetime import timedelta -import datetime as dt + +import numpy as np +from pandas import to_datetime, date_range, Series, DataFrame, period_range +from pandas.tseries.frequencies import infer_freq try: - import pandas.tseries.holiday + from pandas.plotting._converter import DatetimeConverter except ImportError: - pass -from pandas.tseries.frequencies import infer_freq -import numpy as np - - -class dataframe_resample_max_numpy(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - - def time_dataframe_resample_max_numpy(self): - self.df.resample('1s', how=np.max) - - -class dataframe_resample_max_string(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - - def time_dataframe_resample_max_string(self): - self.df.resample('1s', how='max') - - -class dataframe_resample_mean_numpy(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - - def time_dataframe_resample_mean_numpy(self): - self.df.resample('1s', how=np.mean) - + from pandas.tseries.converter import DatetimeConverter -class dataframe_resample_mean_string(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) 
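(The timeseries rewrite below replaces the deprecated `how=` argument of `resample` with the method-chaining API, and fetches the bound aggregation method once in `setup` so the timed call measures only the aggregation. A short sketch of the equivalence, with illustrative variable names only:

import numpy as np
from pandas import DataFrame, date_range

rng = date_range(start='20130101', periods=100000, freq='50L')
df = DataFrame(np.random.randn(100000, 2), index=rng)

# old, deprecated spelling:  df.resample('1s', how='mean')
res = df.resample('1s').mean()          # modern method-chaining API

# pre-binding the method, as the new benchmarks' setup() does:
resample_func = getattr(df.resample('1s'), 'mean')
res2 = resample_func()
)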
+from .pandas_vb_common import setup # noqa - def time_dataframe_resample_mean_string(self): - self.df.resample('1s', how='mean') +class DatetimeIndex(object): -class dataframe_resample_min_numpy(object): goal_time = 0.2 + params = ['dst', 'repeated', 'tz_aware', 'tz_naive'] + param_names = ['index_type'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + def setup(self, index_type): + N = 100000 + dtidxes = {'dst': date_range(start='10/29/2000 1:00:00', + end='10/29/2000 1:59:59', freq='S'), + 'repeated': date_range(start='2000', + periods=N / 10, + freq='s').repeat(10), + 'tz_aware': date_range(start='2000', + periods=N, + freq='s', + tz='US/Eastern'), + 'tz_naive': date_range(start='2000', + periods=N, + freq='s')} + self.index = dtidxes[index_type] - def time_dataframe_resample_min_numpy(self): - self.df.resample('1s', how=np.min) + def time_add_timedelta(self, index_type): + self.index + timedelta(minutes=2) + def time_normalize(self, index_type): + self.index.normalize() -class dataframe_resample_min_string(object): - goal_time = 0.2 + def time_unique(self, index_type): + self.index.unique() - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + def time_to_time(self, index_type): + self.index.time - def time_dataframe_resample_min_string(self): - self.df.resample('1s', how='min') + def time_get(self, index_type): + self.index[0] + def time_timeseries_is_month_start(self, index_type): + self.index.is_month_start -class datetimeindex_add_offset(object): - goal_time = 0.2 + def time_to_date(self, index_type): + self.index.date - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=10000, freq='T') + def time_to_pydatetime(self, index_type): + self.index.to_pydatetime() - def time_datetimeindex_add_offset(self): - (self.rng + timedelta(minutes=2)) +class TzLocalize(object): -class datetimeindex_converter(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + dst_rng = date_range(start='10/29/2000 1:00:00', + end='10/29/2000 1:59:59', freq='S') + self.index = date_range(start='10/29/2000', + end='10/29/2000 00:59:59', freq='S') + self.index = self.index.append(dst_rng) + self.index = self.index.append(dst_rng) + self.index = self.index.append(date_range(start='10/29/2000 2:00:00', + end='10/29/2000 3:00:00', + freq='S')) - def time_datetimeindex_converter(self): - DatetimeConverter.convert(self.rng, None, None) + def time_infer_dst(self): + self.index.tz_localize('US/Eastern', ambiguous='infer') -class datetimeindex_infer_dst(object): - 
goal_time = 0.2
+class ResetIndex(object):

-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        if hasattr(Series, 'convert'):
-            Series.resample = Series.convert
-        self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S')
-        self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S')
-        self.index = self.index.append(self.dst_rng)
-        self.index = self.index.append(self.dst_rng)
-        self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S'))
-
-    def time_datetimeindex_infer_dst(self):
-        self.index.tz_localize('US/Eastern', infer_dst=True)
-
-
-class datetimeindex_normalize(object):
     goal_time = 0.2
+    params = [None, 'US/Eastern']
+    param_names = ['tz']

-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        if hasattr(Series, 'convert'):
-            Series.resample = Series.convert
-        self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern')
-
-    def time_datetimeindex_normalize(self):
-        self.rng.normalize()
-
+    def setup(self, tz):
+        idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz)
+        self.df = DataFrame(np.random.randn(1000, 2), index=idx)

-class datetimeindex_unique(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        if hasattr(Series, 'convert'):
-            Series.resample = Series.convert
-        self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.rng = date_range(start='1/1/2000', periods=1000, freq='T')
-        self.index = self.rng.repeat(10)
-
-    def time_datetimeindex_unique(self):
-        self.index.unique()
-
-
-class dti_reset_index(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        if hasattr(Series, 'convert'):
-            Series.resample = Series.convert
-        self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.rng = date_range(start='1/1/2000', periods=1000, freq='H')
-        self.df = DataFrame(np.random.randn(len(self.rng), 2), self.rng)
-
-    def time_dti_reset_index(self):
+    def time_reset_datetimeindex(self, tz):
         self.df.reset_index()


-class dti_reset_index_tz(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        if hasattr(Series, 'convert'):
-            Series.resample = Series.convert
-        self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.rng = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern')
-        self.df = DataFrame(np.random.randn(len(self.rng), 2), index=self.rng)
-
-    def time_dti_reset_index_tz(self):
-        self.df.reset_index()
-
+class Factorize(object):

-class period_setitem(object):
     goal_time = 0.2
+    params = [None, 'Asia/Tokyo']
+    param_names = ['tz']

-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        if hasattr(Series, 'convert'):
-            Series.resample = Series.convert
-        self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
-        self.df = DataFrame(index=range(len(self.rng)))
-
-    def time_period_setitem(self):
-        self.df['col'] = self.rng
-
-
-class timeseries_1min_5min_mean(object):
-    goal_time = 0.2
+    def setup(self, tz):
+        N = 100000
+        self.dti =
date_range('2011-01-01', freq='H', periods=N, tz=tz) + self.dti = self.dti.repeat(5) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + def time_factorize(self, tz): + self.dti.factorize() - def time_timeseries_1min_5min_mean(self): - self.ts[:10000].resample('5min', how='mean') +class InferFreq(object): -class timeseries_1min_5min_ohlc(object): goal_time = 0.2 + params = [None, 'D', 'B'] + param_names = ['freq'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + def setup(self, freq): + if freq is None: + self.idx = date_range(start='1/1/1700', freq='D', periods=10000) + self.idx.freq = None + else: + self.idx = date_range(start='1/1/1700', freq=freq, periods=10000) - def time_timeseries_1min_5min_ohlc(self): - self.ts[:10000].resample('5min', how='ohlc') + def time_infer_freq(self, freq): + infer_freq(self.idx) -class timeseries_add_irregular(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.lindex = np.random.permutation(self.N)[:(self.N // 2)] - self.rindex = np.random.permutation(self.N)[:(self.N // 2)] - self.left = Series(self.ts.values.take(self.lindex), index=self.ts.index.take(self.lindex)) - self.right = Series(self.ts.values.take(self.rindex), index=self.ts.index.take(self.rindex)) +class TimeDatetimeConverter(object): - def time_timeseries_add_irregular(self): - (self.left + self.right) - - -class timeseries_asof(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') - - def time_timeseries_asof(self): - self.ts.asof(self.dates) + N = 100000 + self.rng = date_range(start='1/1/2000', periods=N, freq='T') + def time_convert(self): + DatetimeConverter.convert(self.rng, None, None) -class timeseries_asof_nan(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') - self.ts[250:5000] = np.nan - - def time_timeseries_asof_nan(self): - self.ts.asof(self.dates) +class Iteration(object): -class timeseries_asof_single(object): goal_time = 0.2 + params = [date_range, period_range] + param_names = ['time_index'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = 
Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + def setup(self, time_index): + N = 10**6 + self.idx = time_index(start='20140101', freq='T', periods=N) + self.exit = 10000 - def time_timeseries_asof_single(self): - self.ts.asof(self.dates[0]) + def time_iter(self, time_index): + for _ in self.idx: + pass + def time_iter_preexit(self, time_index): + for i, _ in enumerate(self.idx): + if i > self.exit: + break -class timeseries_custom_bday_apply(object): - goal_time = 0.2 - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_apply(self): - self.cday.apply(self.date) - - -class timeseries_custom_bday_apply_dt64(object): - goal_time = 0.2 +class ResampleDataFrame(object): - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_apply_dt64(self): - self.cday.apply(self.dt64) - - -class timeseries_custom_bday_cal_decr(object): goal_time = 0.2 + params = ['max', 'mean', 'min'] + param_names = ['method'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_cal_decr(self): - (self.date - (1 * self.cdayh)) - - -class timeseries_custom_bday_cal_incr(object): - goal_time = 0.2 + def setup(self, method): + rng = date_range(start='20130101', periods=100000, freq='50L') + df = DataFrame(np.random.randn(100000, 2), index=rng) + self.resample = getattr(df.resample('1s'), 
method) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_cal_incr(self): - (self.date + (1 * self.cdayh)) - - -class timeseries_custom_bday_cal_incr_n(object): - goal_time = 0.2 + def time_method(self, method): + self.resample() - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_cal_incr_n(self): - (self.date + (10 * self.cdayh)) - - -class timeseries_custom_bday_cal_incr_neg_n(object): - goal_time = 0.2 - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_cal_incr_neg_n(self): - (self.date - (10 * self.cdayh)) - - -class timeseries_custom_bday_decr(object): - goal_time = 0.2 +class ResampleSeries(object): - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_decr(self): - (self.date - self.cday) - - -class timeseries_custom_bday_incr(object): goal_time = 0.2 + params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc']) + 
param_names = ['index', 'freq', 'method'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_incr(self): - (self.date + self.cday) - - -class timeseries_custom_bmonthbegin_decr_n(object): - goal_time = 0.2 + def setup(self, index, freq, method): + indexes = {'period': period_range(start='1/1/2000', + end='1/1/2001', + freq='T'), + 'datetime': date_range(start='1/1/2000', + end='1/1/2001', + freq='T')} + idx = indexes[index] + ts = Series(np.random.randn(len(idx)), index=idx) + self.resample = getattr(ts.resample(freq), method) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthbegin_decr_n(self): - (self.date - (10 * self.cmb)) - - -class timeseries_custom_bmonthbegin_incr_n(object): - goal_time = 0.2 + def time_resample(self, index, freq, method): + self.resample() - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthbegin_incr_n(self): - (self.date + (10 * self.cmb)) - - -class timeseries_custom_bmonthend_decr_n(object): - goal_time = 0.2 - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - 
self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthend_decr_n(self): - (self.date - (10 * self.cme)) - - -class timeseries_custom_bmonthend_incr(object): +class ResampleDatetetime64(object): + # GH 7754 goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthend_incr(self): - (self.date + self.cme) - - -class timeseries_custom_bmonthend_incr_n(object): - goal_time = 0.2 + rng3 = date_range(start='2000-01-01 00:00:00', + end='2000-01-01 10:00:00', freq='555000U') + self.dt_ts = Series(5, rng3, dtype='datetime64[ns]') - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthend_incr_n(self): - (self.date + (10 * self.cme)) - - -class timeseries_datetimeindex_offset_delta(object): - goal_time = 0.2 + def time_resample(self): + self.dt_ts.resample('1S').last() - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_delta(self): - (self.idx1 + self.delta_offset) - - -class timeseries_datetimeindex_offset_fast(object): - goal_time = 0.2 - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_fast(self): - (self.idx1 + self.fast_offset) - - -class timeseries_datetimeindex_offset_slow(object): - goal_time = 0.2 +class AsOf(object): - def setup(self): - self.N 
= 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_slow(self): - (self.idx1 + self.slow_offset) - - -class timeseries_day_apply(object): goal_time = 0.2 + params = ['DataFrame', 'Series'] + param_names = ['constructor'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_apply(self): - self.day.apply(self.date) - - -class timeseries_day_incr(object): - goal_time = 0.2 + def setup(self, constructor): + N = 10000 + M = 10 + rng = date_range(start='1/1/1990', periods=N, freq='53s') + data = {'DataFrame': DataFrame(np.random.randn(N, M)), + 'Series': Series(np.random.randn(N))} + self.ts = data[constructor] + self.ts.index = rng + self.ts2 = self.ts.copy() + self.ts2.iloc[250:5000] = np.nan + self.ts3 = self.ts.copy() + self.ts3.iloc[-5000:] = np.nan + self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s') + self.date = self.dates[0] + self.date_last = self.dates[-1] + self.date_early = self.date - timedelta(10) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_incr(self): - (self.date + self.day) - - -class timeseries_infer_freq(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/1700', freq='D', periods=100000) - self.a = self.rng[:50000].append(self.rng[50002:]) + # test speed of pre-computing NAs. + def time_asof(self, constructor): + self.ts.asof(self.dates) - def time_timeseries_infer_freq(self): - infer_freq(self.a) + # should be roughly the same as above. 
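+    # (assumption: asof() computes the missing-value mask over the whole
+    # series up front either way, so the NaNs in ts2 should not change
+    # the per-call cost much)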
+ def time_asof_nan(self, constructor): + self.ts2.asof(self.dates) + # test speed of the code path for a scalar index + # without *while* loop + def time_asof_single(self, constructor): + self.ts.asof(self.date) -class timeseries_is_month_start(object): - goal_time = 0.2 + # test speed of the code path for a scalar index + # before the start. should be the same as above. + def time_asof_single_early(self, constructor): + self.ts.asof(self.date_early) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1', periods=self.N, freq='B') + # test the speed of the code path for a scalar index + # with a long *while* loop. should still be much + # faster than pre-computing all the NAs. + def time_asof_nan_single(self, constructor): + self.ts3.asof(self.date_last) - def time_timeseries_is_month_start(self): - self.rng.is_month_start +class SortIndex(object): -class timeseries_iter_datetimeindex(object): goal_time = 0.2 + params = [True, False] + param_names = ['monotonic'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_datetimeindex(self): - self.iter_n(self.idx1) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break + def setup(self, monotonic): + N = 10**5 + idx = date_range(start='1/1/2000', periods=N, freq='s') + self.s = Series(np.random.randn(N), index=idx) + if not monotonic: + self.s = self.s.sample(frac=1) + def time_sort_index(self, monotonic): + self.s.sort_index() -class timeseries_iter_datetimeindex_preexit(object): - goal_time = 0.2 + def time_get_slice(self, monotonic): + self.s[:10000] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_datetimeindex_preexit(self): - self.iter_n(self.idx1, self.M) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break +class IrregularOps(object): -class timeseries_iter_periodindex(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_periodindex(self): - self.iter_n(self.idx2) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > 
n)): - break + N = 10**5 + idx = date_range(start='1/1/2000', periods=N, freq='s') + s = Series(np.random.randn(N), index=idx) + self.left = s.sample(frac=1) + self.right = s.sample(frac=1) + def time_add(self): + self.left + self.right -class timeseries_iter_periodindex_preexit(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_periodindex_preexit(self): - self.iter_n(self.idx2, self.M) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break +class Lookup(object): -class timeseries_large_lookup_value(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1500000, freq='S') - self.ts = Series(1, index=self.rng) - - def time_timeseries_large_lookup_value(self): - self.ts[self.ts.index[(len(self.ts) // 2)]] - self.ts.index._cleanup() - - -class timeseries_period_downsample_mean(object): - goal_time = 0.2 + N = 1500000 + rng = date_range(start='1/1/2000', periods=N, freq='S') + self.ts = Series(1, index=rng) + self.lookup_val = rng[N // 2] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) + def time_lookup_and_cleanup(self): + self.ts[self.lookup_val] + self.ts.index._cleanup() - def time_timeseries_period_downsample_mean(self): - self.ts.resample('D', how='mean') +class ToDatetimeYYYYMMDD(object): -class timeseries_resample_datetime64(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') - self.int_ts = Series(5, self.rng, dtype='int64') - self.ts = self.int_ts.astype('datetime64[ns]') + rng = date_range(start='1/1/2000', periods=10000, freq='D') + self.stringsD = Series(rng.strftime('%Y%m%d')) - def time_timeseries_resample_datetime64(self): - self.ts.resample('1S', how='last') + def time_format_YYYYMMDD(self): + to_datetime(self.stringsD, format='%Y%m%d') -class timeseries_series_offset_delta(object): - goal_time = 0.2 +class ToDatetimeISO8601(object): - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = 
pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_delta(self): - (self.s + self.delta_offset) - - -class timeseries_series_offset_fast(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_fast(self): - (self.s + self.fast_offset) - - -class timeseries_series_offset_slow(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_slow(self): - (self.s + self.slow_offset) - - -class timeseries_slice_minutely(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_slice_minutely(self): - self.ts[:10000] + rng = date_range(start='1/1/2000', periods=20000, freq='H') + self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist() + self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist() + self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' + for x in rng] + def time_iso8601(self): + to_datetime(self.strings) -class timeseries_sort_index(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='s') - self.rng = self.rng.take(np.random.permutation(self.N)) - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_sort_index(self): - self.ts.sort_index() + def time_iso8601_nosep(self): + to_datetime(self.strings_nosep) + def time_iso8601_format(self): + to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') -class timeseries_timestamp_downsample_mean(object): - goal_time = 0.2 + def time_iso8601_format_no_sep(self): + to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) + def time_iso8601_tz_spaceformat(self): + to_datetime(self.strings_tz_space) - def time_timeseries_timestamp_downsample_mean(self): - self.ts.resample('D', how='mean') +class ToDatetimeFormat(object): 
-class timeseries_timestamp_tzinfo_cons(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') + self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) + self.s2 = self.s.str.replace(':\\S+$', '') - def time_timeseries_timestamp_tzinfo_cons(self): - self.rng[0] + def time_exact(self): + to_datetime(self.s2, format='%d%b%y') + def time_no_exact(self): + to_datetime(self.s, format='%d%b%y', exact=False) -class timeseries_to_datetime_YYYYMMDD(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.strings = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) - - def time_timeseries_to_datetime_YYYYMMDD(self): - to_datetime(self.strings, format='%Y%m%d') +class ToDatetimeCache(object): -class timeseries_to_datetime_iso8601(object): goal_time = 0.2 + params = [True, False] + param_names = ['cache'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - - def time_timeseries_to_datetime_iso8601(self): - to_datetime(self.strings) - + def setup(self, cache): + N = 10000 + self.unique_numeric_seconds = list(range(N)) + self.dup_numeric_seconds = [1000] * N + self.dup_string_dates = ['2000-02-11'] * N + self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N -class timeseries_to_datetime_iso8601_format(object): - goal_time = 0.2 + def time_unique_seconds_and_unit(self, cache): + to_datetime(self.unique_numeric_seconds, unit='s', cache=cache) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - - def time_timeseries_to_datetime_iso8601_format(self): - to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') + def time_dup_seconds_and_unit(self, cache): + to_datetime(self.dup_numeric_seconds, unit='s', cache=cache) + def time_dup_string_dates(self, cache): + to_datetime(self.dup_string_dates, cache=cache) -class timeseries_with_format_no_exact(object): - goal_time = 0.2 + def time_dup_string_dates_and_format(self, cache): + to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - - def time_timeseries_with_format_no_exact(self): - to_datetime(self.s, format='%d%b%y', exact=False) 
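For context on the parameterized classes in this hunk (`SortIndex`, `ToDatetimeCache`): asv expands `params` into one benchmark run per value and passes that value to `setup` and to every `time_*` method. A minimal sketch of the mechanism, using a hypothetical class name and data, not part of the diff itself:

    from pandas import to_datetime

    class ToDatetimeCacheSketch(object):
        # asv runs this once with cache=True and once with cache=False
        params = [True, False]
        param_names = ['cache']

        def setup(self, cache):
            # many duplicate strings: the case where the parse cache pays off
            self.dates = ['2000-02-11'] * 10000

        def time_parse(self, cache):
            # cache=True parses each unique string once and reuses the result
            to_datetime(self.dates, cache=cache)
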
+ def time_dup_string_tzoffset_dates(self, cache): + to_datetime(self.dup_string_with_tz, cache=cache) -class timeseries_with_format_replace(object): - goal_time = 0.2 +class DatetimeAccessor(object): def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - - def time_timeseries_with_format_replace(self): - to_datetime(self.s.str.replace(':\\S+$', ''), format='%d%b%y') - + N = 100000 + self.series = Series(date_range(start='1/1/2000', periods=N, freq='T')) -class timeseries_year_apply(object): - goal_time = 0.2 + def time_dt_accessor(self): + self.series.dt - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_year_apply(self): - self.year.apply(self.date) - - -class timeseries_year_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_year_incr(self): - (self.date + self.year) \ No newline at end of file + def time_dt_accessor_normalize(self): + self.series.dt.normalize() diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py new file mode 100644 index 0000000000000..c142a9b59fc43 --- /dev/null +++ b/asv_bench/benchmarks/timestamp.py @@ -0,0 +1,119 @@ +import datetime + +from pandas import Timestamp +import pytz + + +class TimestampConstruction(object): + + def time_parse_iso8601_no_tz(self): + Timestamp('2017-08-25 08:16:14') + + def time_parse_iso8601_tz(self): + Timestamp('2017-08-25 08:16:14-0500') + + def time_parse_dateutil(self): + Timestamp('2017/08/25 08:16:14 AM') + + def time_parse_today(self): + Timestamp('today') + + def time_parse_now(self): + Timestamp('now') + + def time_fromordinal(self): + Timestamp.fromordinal(730120) + + def time_fromtimestamp(self): + Timestamp.fromtimestamp(1515448538) + + +class TimestampProperties(object): + goal_time = 0.2 + + _tzs = [None, pytz.timezone('Europe/Amsterdam')] + _freqs = [None, 'B'] + params = [_tzs, _freqs] + param_names = ['tz', 'freq'] + + def setup(self, tz, freq): + self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq) + + 
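The `TimestampProperties` methods that follow time bare attribute access on the `Timestamp` built in `setup` above. For orientation, a sketch of what those attributes return for that fixed date (assuming the tz=None, freq=None combination):

    from pandas import Timestamp

    ts = Timestamp('2017-08-25 08:16:14')
    ts.dayofweek       # 4 -- Friday (Monday == 0)
    ts.quarter         # 3 -- August falls in Q3
    ts.days_in_month   # 31
    ts.is_month_start  # False -- the 25th, not the 1st
    ts.is_leap_year    # False -- 2017 is not a leap year
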
 def time_tz(self, tz, freq): + self.ts.tz + + def time_dayofweek(self, tz, freq): + self.ts.dayofweek + + def time_weekday_name(self, tz, freq): + self.ts.weekday_name + + def time_dayofyear(self, tz, freq): + self.ts.dayofyear + + def time_week(self, tz, freq): + self.ts.week + + def time_quarter(self, tz, freq): + self.ts.quarter + + def time_days_in_month(self, tz, freq): + self.ts.days_in_month + + def time_freqstr(self, tz, freq): + self.ts.freqstr + + def time_is_month_start(self, tz, freq): + self.ts.is_month_start + + def time_is_month_end(self, tz, freq): + self.ts.is_month_end + + def time_is_quarter_start(self, tz, freq): + self.ts.is_quarter_start + + def time_is_quarter_end(self, tz, freq): + self.ts.is_quarter_end + + def time_is_year_start(self, tz, freq): + self.ts.is_year_start + + def time_is_year_end(self, tz, freq): + self.ts.is_year_end + + def time_is_leap_year(self, tz, freq): + self.ts.is_leap_year + + def time_microsecond(self, tz, freq): + self.ts.microsecond + + +class TimestampOps(object): + goal_time = 0.2 + + params = [None, 'US/Eastern'] + param_names = ['tz'] + + def setup(self, tz): + self.ts = Timestamp('2017-08-25 08:16:14', tz=tz) + + def time_replace_tz(self, tz): + self.ts.replace(tzinfo=pytz.timezone('US/Eastern')) + + def time_replace_None(self, tz): + self.ts.replace(tzinfo=None) + + def time_to_pydatetime(self, tz): + self.ts.to_pydatetime() + + +class TimestampAcrossDst(object): + goal_time = 0.2 + + def setup(self): + dt = datetime.datetime(2016, 3, 27, 1) + self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + self.ts2 = Timestamp(dt) + + def time_replace_across_dst(self): + self.ts2.replace(tzinfo=self.tzinfo) diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py index c3041ec2b1ba1..b1179387e65d5 100644 --- a/asv_bench/vbench_to_asv.py +++ b/asv_bench/vbench_to_asv.py @@ -69,7 +69,7 @@ def visit_ClassDef(self, node): return node def visit_TryExcept(self, node): - if any([isinstance(x, (ast.Import, ast.ImportFrom)) for x in node.body]): + if any(isinstance(x, (ast.Import, ast.ImportFrom)) for x in node.body): self.imports.append(node) else: self.generic_visit(node) @@ -114,7 +114,7 @@ def translate_module(target_module): l_vars = {} exec('import ' + target_module) in g_vars - print target_module + print(target_module) module = eval(target_module, g_vars) benchmarks = [] @@ -157,7 +157,7 @@ def translate_module(target_module): mod = os.path.basename(module) if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']: continue - print - print mod + print('') + print(mod) translate_module(mod.replace('.py', '')) diff --git a/bench/alignment.py b/bench/alignment.py deleted file mode 100644 index bc3134f597ee0..0000000000000 --- a/bench/alignment.py +++ /dev/null @@ -1,22 +0,0 @@ -# Setup -from pandas.compat import range, lrange -import numpy as np -import pandas -import la -N = 1000 -K = 50 -arr1 = np.random.randn(N, K) -arr2 = np.random.randn(N, K) -idx1 = lrange(N) -idx2 = lrange(K) - -# pandas -dma1 = pandas.DataFrame(arr1, idx1, idx2) -dma2 = pandas.DataFrame(arr2, idx1[::-1], idx2[::-1]) - -# larry -lar1 = la.larry(arr1, [idx1, idx2]) -lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]]) - -for i in range(100): - result = lar1 + lar2 diff --git a/bench/bench_dense_to_sparse.py b/bench/bench_dense_to_sparse.py deleted file mode 100644 index e1dcd3456e88d..0000000000000 --- a/bench/bench_dense_to_sparse.py +++ /dev/null @@ 
-1,14 +0,0 @@ -from pandas import * - -K = 100 -N = 100000 -rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute()) - -rng2 = np.asarray(rng).astype('M8[us]').astype('i8') - -series = {} -for i in range(1, K + 1): - data = np.random.randn(N)[:-i] - this_rng = rng2[:-i] - data[100:] = np.nan - series[i] = SparseSeries(data, index=this_rng) diff --git a/bench/bench_get_put_value.py b/bench/bench_get_put_value.py deleted file mode 100644 index 427e0b1b10a22..0000000000000 --- a/bench/bench_get_put_value.py +++ /dev/null @@ -1,56 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -N = 1000 -K = 50 - - -def _random_index(howmany): - return Index([rands(10) for _ in range(howmany)]) - -df = DataFrame(np.random.randn(N, K), index=_random_index(N), - columns=_random_index(K)) - - -def get1(): - for col in df.columns: - for row in df.index: - _ = df[col][row] - - -def get2(): - for col in df.columns: - for row in df.index: - _ = df.get_value(row, col) - - -def put1(): - for col in df.columns: - for row in df.index: - df[col][row] = 0 - - -def put2(): - for col in df.columns: - for row in df.index: - df.set_value(row, col, 0) - - -def resize1(): - buf = DataFrame() - for col in df.columns: - for row in df.index: - buf = buf.set_value(row, col, 5.) - return buf - - -def resize2(): - from collections import defaultdict - - buf = defaultdict(dict) - for col in df.columns: - for row in df.index: - buf[col][row] = 5. - - return DataFrame(buf) diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py deleted file mode 100644 index d7a2853e1e7b2..0000000000000 --- a/bench/bench_groupby.py +++ /dev/null @@ -1,66 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -import string -import random - -k = 20000 -n = 10 - -foo = np.tile(np.array([rands(10) for _ in range(k)], dtype='O'), n) -foo2 = list(foo) -random.shuffle(foo) -random.shuffle(foo2) - -df = DataFrame({'A': foo, - 'B': foo2, - 'C': np.random.randn(n * k)}) - -import pandas._sandbox as sbx - - -def f(): - table = sbx.StringHashTable(len(df)) - ret = table.factorize(df['A']) - return ret - - -def g(): - table = sbx.PyObjectHashTable(len(df)) - ret = table.factorize(df['A']) - return ret - -ret = f() - -""" -import pandas._tseries as lib - -f = np.std - - -grouped = df.groupby(['A', 'B']) - -label_list = [ping.labels for ping in grouped.groupings] -shape = [len(ping.ids) for ping in grouped.groupings] - -from pandas.core.groupby import get_group_index - - -group_index = get_group_index(label_list, shape, - sort=True, xnull=True).astype('i4') - -ngroups = np.prod(shape) - -indexer = lib.groupsort_indexer(group_index, ngroups) - -values = df['C'].values.take(indexer) -group_index = group_index.take(indexer) - -f = lambda x: x.std(ddof=1) - -grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups) -result = grouper.get_result() - -expected = grouped.std() -""" diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py deleted file mode 100644 index f3c3f8ba15f70..0000000000000 --- a/bench/bench_join_panel.py +++ /dev/null @@ -1,85 +0,0 @@ -# reasonably efficient - - -def create_panels_append(cls, panels): - """ return an append list of panels """ - panels = [a for a in panels if a is not None] - # corner cases - if len(panels) == 0: - return None - elif len(panels) == 1: - return panels[0] - elif len(panels) == 2 and panels[0] == panels[1]: - return panels[0] - # import pdb; pdb.set_trace() - # create a joint 
index for the axis - - def joint_index_for_axis(panels, axis): - s = set() - for p in panels: - s.update(list(getattr(p, axis))) - return sorted(list(s)) - - def reindex_on_axis(panels, axis, axis_reindex): - new_axis = joint_index_for_axis(panels, axis) - new_panels = [p.reindex(**{axis_reindex: new_axis, - 'copy': False}) for p in panels] - return new_panels, new_axis - # create the joint major index, dont' reindex the sub-panels - we are - # appending - major = joint_index_for_axis(panels, 'major_axis') - # reindex on minor axis - panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor') - # reindex on items - panels, items = reindex_on_axis(panels, 'items', 'items') - # concatenate values - try: - values = np.concatenate([p.values for p in panels], axis=1) - except Exception as detail: - raise Exception("cannot append values that dont' match dimensions! -> [%s] %s" - % (','.join(["%s" % p for p in panels]), str(detail))) - # pm('append - create_panel') - p = Panel(values, items=items, major_axis=major, - minor_axis=minor) - # pm('append - done') - return p - - -# does the job but inefficient (better to handle like you read a table in -# pytables...e.g create a LongPanel then convert to Wide) -def create_panels_join(cls, panels): - """ given an array of panels's, create a single panel """ - panels = [a for a in panels if a is not None] - # corner cases - if len(panels) == 0: - return None - elif len(panels) == 1: - return panels[0] - elif len(panels) == 2 and panels[0] == panels[1]: - return panels[0] - d = dict() - minor, major, items = set(), set(), set() - for panel in panels: - items.update(panel.items) - major.update(panel.major_axis) - minor.update(panel.minor_axis) - values = panel.values - for item, item_index in panel.items.indexMap.items(): - for minor_i, minor_index in panel.minor_axis.indexMap.items(): - for major_i, major_index in panel.major_axis.indexMap.items(): - try: - d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index] - except: - pass - # stack the values - minor = sorted(list(minor)) - major = sorted(list(major)) - items = sorted(list(items)) - # create the 3d stack (items x columns x indicies) - data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan) - for item in items]) - for major_i in major]).transpose() - for minor_i in minor]) - # construct the panel - return Panel(data, items, major, minor) -add_class_method(Panel, create_panels_join, 'join_many') diff --git a/bench/bench_khash_dict.py b/bench/bench_khash_dict.py deleted file mode 100644 index 054fc36131b65..0000000000000 --- a/bench/bench_khash_dict.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Some comparisons of khash.h to Python dict -""" -from __future__ import print_function - -import numpy as np -import os - -from vbench.api import Benchmark -from pandas.util.testing import rands -from pandas.compat import range -import pandas._tseries as lib -import pandas._sandbox as sbx -import time - -import psutil - -pid = os.getpid() -proc = psutil.Process(pid) - - -def object_test_data(n): - pass - - -def string_test_data(n): - return np.array([rands(10) for _ in range(n)], dtype='O') - - -def int_test_data(n): - return np.arange(n, dtype='i8') - -N = 1000000 - -#---------------------------------------------------------------------- -# Benchmark 1: map_locations - - -def map_locations_python_object(): - arr = string_test_data(N) - return _timeit(lambda: lib.map_indices_object(arr)) - - -def map_locations_khash_object(): - arr = string_test_data(N) - - def f(): - 
table = sbx.PyObjectHashTable(len(arr)) - table.map_locations(arr) - return _timeit(f) - - -def _timeit(f, iterations=10): - start = time.time() - for _ in range(iterations): - foo = f() - elapsed = time.time() - start - return elapsed - -#---------------------------------------------------------------------- -# Benchmark 2: lookup_locations - - -def lookup_python(values): - table = lib.map_indices_object(values) - return _timeit(lambda: lib.merge_indexer_object(values, table)) - - -def lookup_khash(values): - table = sbx.PyObjectHashTable(len(values)) - table.map_locations(values) - locs = table.lookup_locations(values) - # elapsed = _timeit(lambda: table.lookup_locations2(values)) - return table - - -def leak(values): - for _ in range(100): - print(proc.get_memory_info()) - table = lookup_khash(values) - # table.destroy() - -arr = string_test_data(N) - -#---------------------------------------------------------------------- -# Benchmark 3: unique - -#---------------------------------------------------------------------- -# Benchmark 4: factorize diff --git a/bench/bench_merge.R b/bench/bench_merge.R deleted file mode 100644 index 3ed4618494857..0000000000000 --- a/bench/bench_merge.R +++ /dev/null @@ -1,161 +0,0 @@ -library(plyr) -library(data.table) -N <- 10000 -indices = rep(NA, N) -indices2 = rep(NA, N) -for (i in 1:N) { - indices[i] <- paste(sample(letters, 10), collapse="") - indices2[i] <- paste(sample(letters, 10), collapse="") -} -left <- data.frame(key=rep(indices[1:8000], 10), - key2=rep(indices2[1:8000], 10), - value=rnorm(80000)) -right <- data.frame(key=indices[2001:10000], - key2=indices2[2001:10000], - value2=rnorm(8000)) - -right2 <- data.frame(key=rep(right$key, 2), - key2=rep(right$key2, 2), - value2=rnorm(16000)) - -left.dt <- data.table(left, key=c("key", "key2")) -right.dt <- data.table(right, key=c("key", "key2")) -right2.dt <- data.table(right2, key=c("key", "key2")) - -# left.dt2 <- data.table(left) -# right.dt2 <- data.table(right) - -## left <- data.frame(key=rep(indices[1:1000], 10), -## key2=rep(indices2[1:1000], 10), -## value=rnorm(100000)) -## right <- data.frame(key=indices[1:1000], -## key2=indices2[1:1000], -## value2=rnorm(10000)) - -timeit <- function(func, niter=10) { - timing = rep(NA, niter) - for (i in 1:niter) { - gc() - timing[i] <- system.time(func())[3] - } - mean(timing) -} - -left.join <- function(sort=FALSE) { - result <- base::merge(left, right, all.x=TRUE, sort=sort) -} - -right.join <- function(sort=FALSE) { - result <- base::merge(left, right, all.y=TRUE, sort=sort) -} - -outer.join <- function(sort=FALSE) { - result <- base::merge(left, right, all=TRUE, sort=sort) -} - -inner.join <- function(sort=FALSE) { - result <- base::merge(left, right, all=FALSE, sort=sort) -} - -left.join.dt <- function(sort=FALSE) { - result <- right.dt[left.dt] -} - -right.join.dt <- function(sort=FALSE) { - result <- left.dt[right.dt] -} - -outer.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right.dt, all=TRUE, sort=sort) -} - -inner.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right.dt, all=FALSE, sort=sort) -} - -plyr.join <- function(type) { - result <- plyr::join(left, right, by=c("key", "key2"), - type=type, match="first") -} - -sort.options <- c(FALSE, TRUE) - -# many-to-one - -results <- matrix(nrow=4, ncol=3) -colnames(results) <- c("base::merge", "plyr", "data.table") -rownames(results) <- c("inner", "outer", "left", "right") - -base.functions <- c(inner.join, outer.join, left.join, right.join) -plyr.functions <- 
c(function() plyr.join("inner"), - function() plyr.join("full"), - function() plyr.join("left"), - function() plyr.join("right")) -dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) -for (i in 1:4) { - base.func <- base.functions[[i]] - plyr.func <- plyr.functions[[i]] - dt.func <- dt.functions[[i]] - results[i, 1] <- timeit(base.func) - results[i, 2] <- timeit(plyr.func) - results[i, 3] <- timeit(dt.func) -} - - -# many-to-many - -left.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all.x=TRUE, sort=sort) -} - -right.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all.y=TRUE, sort=sort) -} - -outer.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all=TRUE, sort=sort) -} - -inner.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all=FALSE, sort=sort) -} - -left.join.dt <- function(sort=FALSE) { - result <- right2.dt[left.dt] -} - -right.join.dt <- function(sort=FALSE) { - result <- left.dt[right2.dt] -} - -outer.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right2.dt, all=TRUE, sort=sort) -} - -inner.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right2.dt, all=FALSE, sort=sort) -} - -sort.options <- c(FALSE, TRUE) - -# many-to-one - -results <- matrix(nrow=4, ncol=3) -colnames(results) <- c("base::merge", "plyr", "data.table") -rownames(results) <- c("inner", "outer", "left", "right") - -base.functions <- c(inner.join, outer.join, left.join, right.join) -plyr.functions <- c(function() plyr.join("inner"), - function() plyr.join("full"), - function() plyr.join("left"), - function() plyr.join("right")) -dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) -for (i in 1:4) { - base.func <- base.functions[[i]] - plyr.func <- plyr.functions[[i]] - dt.func <- dt.functions[[i]] - results[i, 1] <- timeit(base.func) - results[i, 2] <- timeit(plyr.func) - results[i, 3] <- timeit(dt.func) -} - diff --git a/bench/bench_merge.py b/bench/bench_merge.py deleted file mode 100644 index 330dba7b9af69..0000000000000 --- a/bench/bench_merge.py +++ /dev/null @@ -1,105 +0,0 @@ -import random -import gc -import time -from pandas import * -from pandas.compat import range, lrange, StringIO -from pandas.util.testing import rands - -N = 10000 -ngroups = 10 - - -def get_test_data(ngroups=100, n=N): - unique_groups = lrange(ngroups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - random.shuffle(arr) - return arr - -# aggregate multiple columns -# df = DataFrame({'key1' : get_test_data(ngroups=ngroups), -# 'key2' : get_test_data(ngroups=ngroups), -# 'data1' : np.random.randn(N), -# 'data2' : np.random.randn(N)}) - -# df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10), -# 'key2' : get_test_data(ngroups=ngroups//2, n=N//10), -# 'value' : np.random.randn(N // 10)}) -# result = merge.merge(df, df2, on='key2') - -N = 10000 - -indices = np.array([rands(10) for _ in range(N)], dtype='O') -indices2 = np.array([rands(10) for _ in range(N)], dtype='O') -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - -right2 = right.append(right, ignore_index=True) - - -join_methods = ['inner', 'outer', 'left', 'right'] -results = 
DataFrame(index=join_methods, columns=[False, True]) -niter = 10 -for sort in [False, True]: - for join_method in join_methods: - f = lambda: merge(left, right, how=join_method, sort=sort) - gc.disable() - start = time.time() - for _ in range(niter): - f() - elapsed = (time.time() - start) / niter - gc.enable() - results[sort][join_method] = elapsed -# results.columns = ['pandas'] -results.columns = ['dont_sort', 'sort'] - - -# R results -# many to one -r_results = read_table(StringIO(""" base::merge plyr data.table -inner 0.2475 0.1183 0.1100 -outer 0.4213 0.1916 0.2090 -left 0.2998 0.1188 0.0572 -right 0.3102 0.0536 0.0376 -"""), sep='\s+') - -presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'}) -all_results = presults.join(r_results) - -all_results = all_results.div(all_results['pandas'], axis=0) - -all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', - 'base::merge']] - -sort_results = DataFrame.from_items([('pandas', results['sort']), - ('R', r_results['base::merge'])]) -sort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] - - -nosort_results = DataFrame.from_items([('pandas', results['dont_sort']), - ('R', r_results['base::merge'])]) -nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas'] - -# many to many - -# many to one -r_results = read_table(StringIO("""base::merge plyr data.table -inner 0.4610 0.1276 0.1269 -outer 0.9195 0.1881 0.2725 -left 0.6559 0.1257 0.0678 -right 0.6425 0.0522 0.0428 -"""), sep='\s+') - -all_results = presults.join(r_results) -all_results = all_results.div(all_results['pandas'], axis=0) -all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', - 'base::merge']] diff --git a/bench/bench_merge_sqlite.py b/bench/bench_merge_sqlite.py deleted file mode 100644 index 3ad4b810119c3..0000000000000 --- a/bench/bench_merge_sqlite.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -from collections import defaultdict -import gc -import time -from pandas import DataFrame -from pandas.util.testing import rands -from pandas.compat import range, zip -import random - -N = 10000 - -indices = np.array([rands(10) for _ in range(N)], dtype='O') -indices2 = np.array([rands(10) for _ in range(N)], dtype='O') -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - -# right2 = right.append(right, ignore_index=True) -# right = right2 - -# random.shuffle(key2) -# indices2 = indices.copy() -# random.shuffle(indices2) - -# Prepare Database -import sqlite3 -create_sql_indexes = True - -conn = sqlite3.connect(':memory:') -conn.execute( - 'create table left( key varchar(10), key2 varchar(10), value int);') -conn.execute( - 'create table right( key varchar(10), key2 varchar(10), value2 int);') -conn.executemany('insert into left values (?, ?, ?)', - zip(key, key2, left['value'])) -conn.executemany('insert into right values (?, ?, ?)', - zip(right['key'], right['key2'], right['value2'])) - -# Create Indices -if create_sql_indexes: - conn.execute('create index left_ix on left(key, key2)') - conn.execute('create index right_ix on right(key, key2)') - - -join_methods = ['inner', 'left outer', 'left'] # others not supported -sql_results = DataFrame(index=join_methods, columns=[False]) -niter = 5 -for sort in [False]: - for join_method in join_methods: - sql = """CREATE TABLE test as select * - from left - %s join 
right - on left.key=right.key - and left.key2 = right.key2;""" % join_method - sql = """select * - from left - %s join right - on left.key=right.key - and left.key2 = right.key2;""" % join_method - - if sort: - sql = '%s order by key, key2' % sql - f = lambda: list(conn.execute(sql)) # list fetches results - g = lambda: conn.execute(sql) # list fetches results - gc.disable() - start = time.time() - # for _ in range(niter): - g() - elapsed = (time.time() - start) / niter - gc.enable() - - cur = conn.execute("DROP TABLE test") - conn.commit() - - sql_results[sort][join_method] = elapsed - sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort'] - sql_results.index = ['inner', 'outer', 'left'] - - sql = """select * - from left - inner join right - on left.key=right.key - and left.key2 = right.key2;""" diff --git a/bench/bench_pivot.R b/bench/bench_pivot.R deleted file mode 100644 index 06dc6a105bc43..0000000000000 --- a/bench/bench_pivot.R +++ /dev/null @@ -1,27 +0,0 @@ -library(reshape2) - - -n <- 100000 -a.size <- 5 -b.size <- 5 - -data <- data.frame(a=sample(letters[1:a.size], n, replace=T), - b=sample(letters[1:b.size], n, replace=T), - c=rnorm(n), - d=rnorm(n)) - -timings <- numeric() - -# acast(melt(data, id=c("a", "b")), a ~ b, mean) -# acast(melt(data, id=c("a", "b")), a + b ~ variable, mean) - -for (i in 1:10) { - gc() - tim <- system.time(acast(melt(data, id=c("a", "b")), a ~ b, mean, - subset=.(variable=="c"))) - timings[i] = tim[3] -} - -mean(timings) - -acast(melt(data, id=c("a", "b")), a ~ b, mean, subset=.(variable="c")) diff --git a/bench/bench_pivot.py b/bench/bench_pivot.py deleted file mode 100644 index 007bd0aaebc2f..0000000000000 --- a/bench/bench_pivot.py +++ /dev/null @@ -1,16 +0,0 @@ -from pandas import * -import string - - -n = 100000 -asize = 5 -bsize = 5 - -letters = np.asarray(list(string.letters), dtype=object) - -data = DataFrame(dict(foo=letters[:asize][np.random.randint(0, asize, n)], - bar=letters[:bsize][np.random.randint(0, bsize, n)], - baz=np.random.randn(n), - qux=np.random.randn(n))) - -table = pivot_table(data, xby=['foo', 'bar']) diff --git a/bench/bench_sparse.py b/bench/bench_sparse.py deleted file mode 100644 index 0aa705118d970..0000000000000 --- a/bench/bench_sparse.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np - -from pandas import * -import pandas.core.sparse as spm -import pandas.compat as compat -reload(spm) -from pandas.core.sparse import * - -N = 10000. 
- -arr1 = np.arange(N) -index = Index(np.arange(N)) - -off = N // 10 -arr1[off: 2 * off] = np.NaN -arr1[4 * off: 5 * off] = np.NaN -arr1[8 * off: 9 * off] = np.NaN - -arr2 = np.arange(N) -arr2[3 * off // 2: 2 * off + off // 2] = np.NaN -arr2[8 * off + off // 2: 9 * off + off // 2] = np.NaN - -s1 = SparseSeries(arr1, index=index) -s2 = SparseSeries(arr2, index=index) - -is1 = SparseSeries(arr1, kind='integer', index=index) -is2 = SparseSeries(arr2, kind='integer', index=index) - -s1_dense = s1.to_dense() -s2_dense = s2.to_dense() - -if compat.is_platform_linux(): - pth = '/home/wesm/code/pandas/example' -else: - pth = '/Users/wesm/code/pandas/example' - -dm = DataFrame.load(pth) - -sdf = dm.to_sparse() - - -def new_data_like(sdf): - new_data = {} - for col, series in compat.iteritems(sdf): - new_data[col] = SparseSeries(np.random.randn(len(series.sp_values)), - index=sdf.index, - sparse_index=series.sp_index, - fill_value=series.fill_value) - - return SparseDataFrame(new_data) - -# data = {} -# for col, ser in dm.iteritems(): -# data[col] = SparseSeries(ser) - -dwp = Panel.fromDict({'foo': dm}) -# sdf = SparseDataFrame(data) - - -lp = stack_sparse_frame(sdf) - - -swp = SparsePanel({'A': sdf}) -swp = SparsePanel({'A': sdf, - 'B': sdf, - 'C': sdf, - 'D': sdf}) - -y = sdf -x = SparsePanel({'x1': sdf + new_data_like(sdf) / 10, - 'x2': sdf + new_data_like(sdf) / 10}) - -dense_y = sdf -dense_x = x.to_dense() - -# import hotshot, hotshot.stats -# prof = hotshot.Profile('test.prof') - -# benchtime, stones = prof.runcall(ols, y=y, x=x) - -# prof.close() - -# stats = hotshot.stats.load('test.prof') - -dense_model = ols(y=dense_y, x=dense_x) - -import pandas.stats.plm as plm -import pandas.stats.interface as face -reload(plm) -reload(face) - -# model = face.ols(y=y, x=x) diff --git a/bench/bench_take_indexing.py b/bench/bench_take_indexing.py deleted file mode 100644 index 5fb584bcfe45f..0000000000000 --- a/bench/bench_take_indexing.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import print_function -import numpy as np - -from pandas import * -import pandas._tseries as lib - -from pandas import DataFrame -import timeit -from pandas.compat import zip - -setup = """ -from pandas import Series -import pandas._tseries as lib -import random -import numpy as np - -import random -n = %d -k = %d -arr = np.random.randn(n, k) -indexer = np.arange(n, dtype=np.int32) -indexer = indexer[::-1] -""" - -sizes = [100, 1000, 10000, 100000] -iters = [1000, 1000, 100, 1] - -fancy_2d = [] -take_2d = [] -cython_2d = [] - -n = 1000 - - -def _timeit(stmt, size, k=5, iters=1000): - timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k)) - return timer.timeit(n) / n - -for sz, its in zip(sizes, iters): - print(sz) - fancy_2d.append(_timeit('arr[indexer]', sz, iters=its)) - take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its)) - cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its)) - -df = DataFrame({'fancy': fancy_2d, - 'take': take_2d, - 'cython': cython_2d}) - -print(df) - -from pandas.rpy.common import r -r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)') -r('set.seed(12345') -r('indexer <- sample(1:10000)') -r('mat[indexer,]') diff --git a/bench/bench_unique.py b/bench/bench_unique.py deleted file mode 100644 index 87bd2f2df586c..0000000000000 --- a/bench/bench_unique.py +++ /dev/null @@ -1,278 +0,0 @@ -from __future__ import print_function -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range, zip -import pandas._tseries as lib -import numpy as 
np -import matplotlib.pyplot as plt - -N = 50000 -K = 10000 - -groups = np.array([rands(10) for _ in range(K)], dtype='O') -groups2 = np.array([rands(10) for _ in range(K)], dtype='O') - -labels = np.tile(groups, N // K) -labels2 = np.tile(groups2, N // K) -data = np.random.randn(N) - - -def timeit(f, niter): - import gc - import time - gc.disable() - start = time.time() - for _ in range(niter): - f() - elapsed = (time.time() - start) / niter - gc.enable() - return elapsed - - -def algo1(): - unique_labels = np.unique(labels) - result = np.empty(len(unique_labels)) - for i, label in enumerate(unique_labels): - result[i] = data[labels == label].sum() - - -def algo2(): - unique_labels = np.unique(labels) - indices = lib.groupby_indices(labels) - result = np.empty(len(unique_labels)) - - for i, label in enumerate(unique_labels): - result[i] = data.take(indices[label]).sum() - - -def algo3_nosort(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(labels, sort=False) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - - -def algo3_sort(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(labels, sort=True) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - -import numpy as np -import random - - -# dict to hold results -counts = {} - -# a hack to generate random key, value pairs. -# 5k keys, 100k values -x = np.tile(np.arange(5000, dtype='O'), 20) -random.shuffle(x) -xarr = x -x = [int(y) for y in x] -data = np.random.uniform(0, 1, 100000) - - -def f(): - # groupby sum - for k, v in zip(x, data): - try: - counts[k] += v - except KeyError: - counts[k] = v - - -def f2(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(xarr, sort=False) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - - -def algo4(): - rizer = lib.DictFactorizer() - labs1, _ = rizer.factorize(labels, sort=False) - k1 = len(rizer.uniques) - - rizer = lib.DictFactorizer() - labs2, _ = rizer.factorize(labels2, sort=False) - k2 = len(rizer.uniques) - - group_id = labs1 * k2 + labs2 - max_group = k1 * k2 - - if max_group > 1e6: - rizer = lib.Int64Factorizer(len(group_id)) - group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True) - max_group = len(rizer.uniques) - - out = np.empty(max_group) - counts = np.zeros(max_group, dtype='i4') - lib.group_add(out, counts, data, group_id) - -# cumtime percall filename:lineno(function) -# 0.592 0.592 :1() - # 0.584 0.006 groupby_ex.py:37(algo3_nosort) - # 0.535 0.005 {method 'factorize' of DictFactorizer' objects} - # 0.047 0.000 {pandas._tseries.group_add} - # 0.002 0.000 numeric.py:65(zeros_like) - # 0.001 0.000 {method 'fill' of 'numpy.ndarray' objects} - # 0.000 0.000 {numpy.core.multiarray.empty_like} - # 0.000 0.000 {numpy.core.multiarray.empty} - -# UNIQUE timings - -# N = 10000000 -# K = 500000 - -# groups = np.array([rands(10) for _ in range(K)], dtype='O') - -# labels = np.tile(groups, N // K) -data = np.random.randn(N) - -data = np.random.randn(N) - -Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000] - -# Ks = [500000, 1000000, 2500000, 5000000, 10000000] - -import psutil -import os -import gc - -pid = os.getpid() -proc = psutil.Process(pid) - - -def dict_unique(values, expected_K, sort=False, memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - rizer = lib.DictFactorizer() - result = rizer.unique_int64(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return 
result - - if sort: - result.sort() - assert(len(result) == expected_K) - return result - - -def khash_unique(values, expected_K, size_hint=False, sort=False, - memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - if size_hint: - rizer = lib.Factorizer(len(values)) - else: - rizer = lib.Factorizer(100) - - result = [] - result = rizer.unique(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def khash_unique_str(values, expected_K, size_hint=False, sort=False, - memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - if size_hint: - rizer = lib.StringHashTable(len(values)) - else: - rizer = lib.StringHashTable(100) - - result = [] - result = rizer.unique(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def khash_unique_int64(values, expected_K, size_hint=False, sort=False): - if size_hint: - rizer = lib.Int64HashTable(len(values)) - else: - rizer = lib.Int64HashTable(100) - - result = [] - result = rizer.unique(values) - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def hash_bench(): - numpy = [] - dict_based = [] - dict_based_sort = [] - khash_hint = [] - khash_nohint = [] - for K in Ks: - print(K) - # groups = np.array([rands(10) for _ in range(K)]) - # labels = np.tile(groups, N // K).astype('O') - - groups = np.random.randint(0, long(100000000000), size=K) - labels = np.tile(groups, N // K) - dict_based.append(timeit(lambda: dict_unique(labels, K), 20)) - khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20)) - khash_hint.append(timeit(lambda: khash_unique_int64(labels, K, - size_hint=True), 20)) - - # memory, hard to get - # dict_based.append(np.mean([dict_unique(labels, K, memory=True) - # for _ in range(10)])) - # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True) - # for _ in range(10)])) - # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True) - # for _ in range(10)])) - - # dict_based_sort.append(timeit(lambda: dict_unique(labels, K, - # sort=True), 10)) - # numpy.append(timeit(lambda: np.unique(labels), 10)) - - # unique_timings = DataFrame({'numpy.unique' : numpy, - # 'dict, no sort' : dict_based, - # 'dict, sort' : dict_based_sort}, - # columns=['dict, no sort', - # 'dict, sort', 'numpy.unique'], - # index=Ks) - - unique_timings = DataFrame({'dict': dict_based, - 'khash, preallocate': khash_hint, - 'khash': khash_nohint}, - columns=['khash, preallocate', 'khash', 'dict'], - index=Ks) - - unique_timings.plot(kind='bar', legend=False) - plt.legend(loc='best') - plt.title('Unique on 100,000 values, int64') - plt.xlabel('Number of unique labels') - plt.ylabel('Mean execution time') - - plt.show() diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R deleted file mode 100644 index 69d0f7a9eec63..0000000000000 --- a/bench/bench_with_subset.R +++ /dev/null @@ -1,53 +0,0 @@ -library(microbenchmark) -library(data.table) - - -data.frame.subset.bench <- function (n=1e7, times=30) { - df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), - times=times)) -} - - -# data.table allows something very similar to query with an expression -# but we have chained comparisons AND we're faster BOO YAH! 
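The chained comparison the comment above refers to is what `DataFrame.query` accepts directly; the deleted Python counterpart (`bench/bench_with_subset.py`, further below) timed exactly this expression. A small illustration with made-up data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))
    # a <= b <= ... is a single chained comparison inside the query string
    subset = df.query('a <= b <= c ** 2 + b ** 2 - a and b > c')
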
-data.table.subset.expression.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c], - times=times)) -} - - -# compare against subset with data.table for good measure -data.table.subset.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), - times=times)) -} - - -data.frame.with.bench <- function (n=1e7, times=30) { - df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - - print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), - times=times)) -} - - -data.table.with.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), - times=times)) -} - - -bench <- function () { - data.frame.subset.bench() - data.table.subset.expression.bench() - data.table.subset.bench() - data.frame.with.bench() - data.table.with.bench() -} - - -bench() diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py deleted file mode 100644 index 017401df3f7f3..0000000000000 --- a/bench/bench_with_subset.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python - -""" -Microbenchmarks for comparison with R's "with" and "subset" functions -""" - -from __future__ import print_function -import numpy as np -from numpy import array -from timeit import repeat as timeit -from pandas.compat import range, zip -from pandas import DataFrame - - -setup_common = """from pandas import DataFrame -from numpy.random import randn -df = DataFrame(randn(%d, 3), columns=list('abc')) -%s""" - - -setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" - - -def bench_with(n, times=10, repeat=3, engine='numexpr'): - return np.array(timeit('df.eval(s, engine=%r)' % engine, - setup=setup_common % (n, setup_with), - repeat=repeat, number=times)) / times - - -setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" - - -def bench_subset(n, times=10, repeat=3, engine='numexpr'): - return np.array(timeit('df.query(s, engine=%r)' % engine, - setup=setup_common % (n, setup_subset), - repeat=repeat, number=times)) / times - - -def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False): - r = np.logspace(mn, mx, num=num).round().astype(int) - - ev = DataFrame(np.empty((num, len(engines))), columns=engines) - qu = ev.copy(deep=True) - - ev['size'] = qu['size'] = r - - for engine in engines: - for i, n in enumerate(r): - if verbose: - print('engine: %r, i == %d' % (engine, i)) - ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine) - qu.loc[i, engine] = bench_subset(n, times=1, repeat=1, - engine=engine) - - return ev, qu - - -def plot_perf(df, engines, title, filename=None): - from matplotlib.pyplot import figure, rc - - try: - from mpltools import style - except ImportError: - pass - else: - style.use('ggplot') - - rc('text', usetex=True) - - fig = figure(figsize=(4, 3), dpi=100) - ax = fig.add_subplot(111) - - for engine in engines: - ax.plot(df.size, df[engine], label=engine, lw=2) - - ax.set_xlabel('Number of Rows') - ax.set_ylabel('Time (s)') - ax.set_title(title) - ax.legend(loc='best') - ax.tick_params(top=False, right=False) - - fig.tight_layout() - - if filename is not None: - fig.savefig(filename) - - -if __name__ == '__main__': - import os - import pandas as pd - - pandas_dir = 
os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - static_path = os.path.join(pandas_dir, 'doc', 'source', '_static') - - join = lambda p: os.path.join(static_path, p) - - fn = join('eval-query-perf-data.h5') - - engines = 'python', 'numexpr' - - if not os.path.exists(fn): - ev, qu = bench(verbose=True) - ev.to_hdf(fn, 'eval') - qu.to_hdf(fn, 'query') - else: - ev = pd.read_hdf(fn, 'eval') - qu = pd.read_hdf(fn, 'query') - - plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png')) - plot_perf(qu, engines, 'DataFrame.query()', - filename=join('query-perf.png')) - - plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()', - filename=join('eval-perf-small.png')) - plot_perf(qu[qu.size <= 500000], engines, 'DataFrame.query()', - filename=join('query-perf-small.png')) diff --git a/bench/better_unique.py b/bench/better_unique.py deleted file mode 100644 index e03a4f433ce66..0000000000000 --- a/bench/better_unique.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import print_function -from pandas import DataFrame -from pandas.compat import range, zip -import timeit - -setup = """ -from pandas import Series -import pandas._tseries as _tseries -from pandas.compat import range -import random -import numpy as np - -def better_unique(values): - uniques = _tseries.fast_unique(values) - id_map = _tseries.map_indices_buf(uniques) - labels = _tseries.get_unique_labels(values, id_map) - return uniques, labels - -tot = 100000 - -def get_test_data(ngroups=100, n=tot): - unique_groups = range(ngroups) - random.shuffle(unique_groups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - return arr - -arr = get_test_data(ngroups=%d) -""" - -group_sizes = [10, 100, 1000, 10000, - 20000, 30000, 40000, - 50000, 60000, 70000, - 80000, 90000, 100000] - -numbers = [100, 100, 50] + [10] * 10 - -numpy = [] -wes = [] - -for sz, n in zip(group_sizes, numbers): - # wes_timer = timeit.Timer(stmt='better_unique(arr)', - # setup=setup % sz) - wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)', - setup=setup % sz) - - numpy_timer = timeit.Timer(stmt='np.unique(arr)', - setup=setup % sz) - - print(n) - numpy_result = numpy_timer.timeit(number=n) / n - wes_result = wes_timer.timeit(number=n) / n - - print('Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result)) - - wes.append(wes_result) - numpy.append(numpy_result) - -result = DataFrame({'wes': wes, 'numpy': numpy}, index=group_sizes) - - -def make_plot(numpy, wes): - pass - -# def get_test_data(ngroups=100, n=100000): -# unique_groups = range(ngroups) -# random.shuffle(unique_groups) -# arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - -# if len(arr) < n: -# arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], -# dtype=object) - -# return arr - -# arr = get_test_data(ngroups=1000) diff --git a/bench/duplicated.R b/bench/duplicated.R deleted file mode 100644 index eb2376df2932a..0000000000000 --- a/bench/duplicated.R +++ /dev/null @@ -1,22 +0,0 @@ -N <- 100000 - -k1 = rep(NA, N) -k2 = rep(NA, N) -for (i in 1:N){ - k1[i] <- paste(sample(letters, 1), collapse="") - k2[i] <- paste(sample(letters, 1), collapse="") -} -df <- data.frame(a=k1, b=k2, c=rep(1:100, N / 100)) -df2 <- data.frame(a=k1, b=k2) - -timings <- numeric() -timings2 <- numeric() -for (i in 1:50) { - gc() - timings[i] = system.time(deduped <- df[!duplicated(df),])[3] - gc() - timings2[i] = system.time(deduped <- 
df[!duplicated(df[,c("a", "b")]),])[3] -} - -mean(timings) -mean(timings2) diff --git a/bench/io_roundtrip.py b/bench/io_roundtrip.py deleted file mode 100644 index d87da0ec6321a..0000000000000 --- a/bench/io_roundtrip.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import print_function -import time -import os -import numpy as np - -import la -import pandas -from pandas.compat import range -from pandas import datetools, DatetimeIndex - - -def timeit(f, iterations): - start = time.clock() - - for i in range(iterations): - f() - - return time.clock() - start - - -def rountrip_archive(N, K=50, iterations=10): - # Create data - arr = np.random.randn(N, K) - # lar = la.larry(arr) - dma = pandas.DataFrame(arr, - DatetimeIndex('1/1/2000', periods=N, - offset=datetools.Minute())) - dma[201] = 'bar' - - # filenames - filename_numpy = '/Users/wesm/tmp/numpy.npz' - filename_larry = '/Users/wesm/tmp/archive.hdf5' - filename_pandas = '/Users/wesm/tmp/pandas_tmp' - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - - try: - os.unlink(filename_pandas) - except: - pass - - # Time a round trip save and load - # numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) - # numpy_time = timeit(numpy_f, iterations) / iterations - - # larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) - # larry_time = timeit(larry_f, iterations) / iterations - - pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pandas_time = timeit(pandas_f, iterations) / iterations - print('pandas (HDF5) %7.4f seconds' % pandas_time) - - pickle_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pickle_time = timeit(pickle_f, iterations) / iterations - print('pandas (pickle) %7.4f seconds' % pickle_time) - - # print('Numpy (npz) %7.4f seconds' % numpy_time) - # print('larry (HDF5) %7.4f seconds' % larry_time) - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - - try: - os.unlink(filename_pandas) - except: - pass - - -def numpy_roundtrip(filename, arr1, arr2): - np.savez(filename, arr1=arr1, arr2=arr2) - npz = np.load(filename) - arr1 = npz['arr1'] - arr2 = npz['arr2'] - - -def larry_roundtrip(filename, lar1, lar2): - io = la.IO(filename) - io['lar1'] = lar1 - io['lar2'] = lar2 - lar1 = io['lar1'] - lar2 = io['lar2'] - - -def pandas_roundtrip(filename, dma1, dma2): - # What's the best way to code this? 
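The round trip being timed here is plain dict-style access on an `HDFStore`. A minimal sketch of the same pattern with a top-level import and a hypothetical scratch filename (requires PyTables):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(100, 3))
    with pd.HDFStore('scratch.h5') as store:
        store['df'] = df              # write
        roundtripped = store['df']    # read back
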
- from pandas.io.pytables import HDFStore - store = HDFStore(filename) - store['dma1'] = dma1 - store['dma2'] = dma2 - dma1 = store['dma1'] - dma2 = store['dma2'] - - -def pandas_roundtrip_pickle(filename, dma1, dma2): - dma1.save(filename) - dma1 = pandas.DataFrame.load(filename) - dma2.save(filename) - dma2 = pandas.DataFrame.load(filename) - -if __name__ == '__main__': - rountrip_archive(10000, K=200) diff --git a/bench/serialize.py b/bench/serialize.py deleted file mode 100644 index b0edd6a5752d2..0000000000000 --- a/bench/serialize.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import print_function -from pandas.compat import range, lrange -import time -import os -import numpy as np - -import la -import pandas - - -def timeit(f, iterations): - start = time.clock() - - for i in range(iterations): - f() - - return time.clock() - start - - -def roundtrip_archive(N, iterations=10): - - # Create data - arr = np.random.randn(N, N) - lar = la.larry(arr) - dma = pandas.DataFrame(arr, lrange(N), lrange(N)) - - # filenames - filename_numpy = '/Users/wesm/tmp/numpy.npz' - filename_larry = '/Users/wesm/tmp/archive.hdf5' - filename_pandas = '/Users/wesm/tmp/pandas_tmp' - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - try: - os.unlink(filename_pandas) - except: - pass - - # Time a round trip save and load - numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) - numpy_time = timeit(numpy_f, iterations) / iterations - - larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) - larry_time = timeit(larry_f, iterations) / iterations - - pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pandas_time = timeit(pandas_f, iterations) / iterations - - print('Numpy (npz) %7.4f seconds' % numpy_time) - print('larry (HDF5) %7.4f seconds' % larry_time) - print('pandas (HDF5) %7.4f seconds' % pandas_time) - - -def numpy_roundtrip(filename, arr1, arr2): - np.savez(filename, arr1=arr1, arr2=arr2) - npz = np.load(filename) - arr1 = npz['arr1'] - arr2 = npz['arr2'] - - -def larry_roundtrip(filename, lar1, lar2): - io = la.IO(filename) - io['lar1'] = lar1 - io['lar2'] = lar2 - lar1 = io['lar1'] - lar2 = io['lar2'] - - -def pandas_roundtrip(filename, dma1, dma2): - from pandas.io.pytables import HDFStore - store = HDFStore(filename) - store['dma1'] = dma1 - store['dma2'] = dma2 - dma1 = store['dma1'] - dma2 = store['dma2'] - - -def pandas_roundtrip_pickle(filename, dma1, dma2): - dma1.save(filename) - dma1 = pandas.DataFrame.load(filename) - dma2.save(filename) - dma2 = pandas.DataFrame.load(filename) diff --git a/bench/test.py b/bench/test.py deleted file mode 100644 index 2339deab313a1..0000000000000 --- a/bench/test.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np -import itertools -import collections -import scipy.ndimage as ndi -from pandas.compat import zip, range - -N = 10000 - -lat = np.random.randint(0, 360, N) -lon = np.random.randint(0, 360, N) -data = np.random.randn(N) - - -def groupby1(lat, lon, data): - indexer = np.lexsort((lon, lat)) - lat = lat.take(indexer) - lon = lon.take(indexer) - sorted_data = data.take(indexer) - - keys = 1000. 
* lat + lon - unique_keys = np.unique(keys) - bounds = keys.searchsorted(unique_keys) - - result = group_agg(sorted_data, bounds, lambda x: x.mean()) - - decoder = keys.searchsorted(unique_keys) - - return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) - - -def group_mean(lat, lon, data): - indexer = np.lexsort((lon, lat)) - lat = lat.take(indexer) - lon = lon.take(indexer) - sorted_data = data.take(indexer) - - keys = 1000 * lat + lon - unique_keys = np.unique(keys) - - result = ndi.mean(sorted_data, labels=keys, index=unique_keys) - decoder = keys.searchsorted(unique_keys) - - return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) - - -def group_mean_naive(lat, lon, data): - grouped = collections.defaultdict(list) - for lt, ln, da in zip(lat, lon, data): - grouped[(lt, ln)].append(da) - - averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items()) - - return averaged - - -def group_agg(values, bounds, f): - N = len(values) - result = np.empty(len(bounds), dtype=float) - for i, left_bound in enumerate(bounds): - if i == len(bounds) - 1: - right_bound = N - else: - right_bound = bounds[i + 1] - - result[i] = f(values[left_bound: right_bound]) - - return result - -# for i in range(10): -# groupby1(lat, lon, data) diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R deleted file mode 100644 index 294d55f51a9ab..0000000000000 --- a/bench/zoo_bench.R +++ /dev/null @@ -1,71 +0,0 @@ -library(zoo) -library(xts) -library(fts) -library(tseries) -library(its) -library(xtable) - -## indices = rep(NA, 100000) -## for (i in 1:100000) -## indices[i] <- paste(sample(letters, 10), collapse="") - - - -## x <- zoo(rnorm(100000), indices) -## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)]) - -## indices <- as.POSIXct(1:100000) - -indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100) - -sz <- 500000 - -## x <- xts(rnorm(sz), sample(indices, sz)) -## y <- xts(rnorm(sz), sample(indices, sz)) - -zoo.bench <- function(){ - x <- zoo(rnorm(sz), sample(indices, sz)) - y <- zoo(rnorm(sz), sample(indices, sz)) - timeit(function() {x + y}) -} - -xts.bench <- function(){ - x <- xts(rnorm(sz), sample(indices, sz)) - y <- xts(rnorm(sz), sample(indices, sz)) - timeit(function() {x + y}) -} - -fts.bench <- function(){ - x <- fts(rnorm(sz), sort(sample(indices, sz))) - y <- fts(rnorm(sz), sort(sample(indices, sz)) - timeit(function() {x + y}) -} - -its.bench <- function(){ - x <- its(rnorm(sz), sort(sample(indices, sz))) - y <- its(rnorm(sz), sort(sample(indices, sz))) - timeit(function() {x + y}) -} - -irts.bench <- function(){ - x <- irts(sort(sample(indices, sz)), rnorm(sz)) - y <- irts(sort(sample(indices, sz)), rnorm(sz)) - timeit(function() {x + y}) -} - -timeit <- function(f){ - timings <- numeric() - for (i in 1:10) { - gc() - timings[i] = system.time(f())[3] - } - mean(timings) -} - -bench <- function(){ - results <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench()) - names <- c("xts", "fts", "its", "zoo") - data.frame(results, names) -} - -result <- bench() diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py deleted file mode 100644 index 74cb1952a5a2a..0000000000000 --- a/bench/zoo_bench.py +++ /dev/null @@ -1,36 +0,0 @@ -from pandas import * -from pandas.util.testing import rands - -n = 1000000 -# indices = Index([rands(10) for _ in xrange(n)]) - - -def sample(values, k): - sampler = np.random.permutation(len(values)) - return values.take(sampler[:k]) -sz = 500000 -rng = np.arange(0, 10000000000000, 10000000) -stamps = np.datetime64(datetime.now()).view('i8') + 
rng -idx1 = np.sort(sample(stamps, sz)) -idx2 = np.sort(sample(stamps, sz)) -ts1 = Series(np.random.randn(sz), idx1) -ts2 = Series(np.random.randn(sz), idx2) - - -# subsample_size = 90000 - -# x = Series(np.random.randn(100000), indices) -# y = Series(np.random.randn(subsample_size), -# index=sample(indices, subsample_size)) - - -# lx = larry(np.random.randn(100000), [list(indices)]) -# ly = larry(np.random.randn(subsample_size), [list(y.index)]) - -# Benchmark 1: Two 1-million length time series (int64-based index) with -# randomly chosen timestamps - -# Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join) - -# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5)) -# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10)) diff --git a/ci/after_script.sh b/ci/after_script.sh deleted file mode 100755 index b17d69daa5b8d..0000000000000 --- a/ci/after_script.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -#wget https://raw.github.com/y-p/ScatterCI-CLI/master/scatter_cli.py -#chmod u+x scatter_cli.py - -pip install -I requests==2.1.0 -echo "${TRAVIS_PYTHON_VERSION:0:4}" -if [ x"${TRAVIS_PYTHON_VERSION:0:4}" == x"2.6" ]; then - pip install simplejson; -fi - -# ScatterCI accepts a build log, but currently does nothing with it. -echo '' > /tmp/build.log - -# nore exposed in the build logs -#export SCATTERCI_ACCESS_KEY= -#export SCATTERCI_HOST= - -# Generate a json file describing system and dep versions -ci/print_versions.py -j /tmp/env.json - -# nose ran using "--with-xunit --xunit-file nosetest.xml" and generated /tmp/nosetest.xml -# Will timeout if server not available, and should not fail the build -#python scatter_cli.py --xunit-file /tmp/nosetests.xml --log-file /tmp/build.log --env-file /tmp/env.json --build-name "$JOB_NAME" --succeed - -true # never fail because bad things happened here diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml new file mode 100644 index 0000000000000..10511ac0e00ca --- /dev/null +++ b/ci/appveyor-27.yaml @@ -0,0 +1,30 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - bottleneck + - dateutil + - gcsfs + - html5lib + - jinja2=2.8 + - lxml + - matplotlib + - numexpr + - numpy=1.12* + - openpyxl + - pytables + - python=2.7.* + - pytz + - s3fs + - scipy + - sqlalchemy + - xlrd + - xlsxwriter + - xlwt + # universal + - cython + - pytest + - pytest-xdist + - moto diff --git a/ci/appveyor-36.yaml b/ci/appveyor-36.yaml new file mode 100644 index 0000000000000..868724419c464 --- /dev/null +++ b/ci/appveyor-36.yaml @@ -0,0 +1,27 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - blosc + - bottleneck + - fastparquet + - feather-format + - matplotlib + - numexpr + - numpy=1.14* + - openpyxl + - pyarrow + - pytables + - python-dateutil + - python=3.6.* + - pytz + - scipy + - thrift=0.10* + - xlrd + - xlsxwriter + - xlwt + # universal + - cython + - pytest + - pytest-xdist diff --git a/ci/before_install.sh b/ci/before_install.sh deleted file mode 100755 index e4376e1bf21c2..0000000000000 --- a/ci/before_install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# If envars.sh determined we're running in an authorized fork -# and the user opted in to the network cache,and that cached versions -# are available on the cache server, download and deploy the cached -# files to the local filesystem - -echo "inside $0" - -# overview -sudo apt-get update $APT_ARGS # run apt-get update for all versions - -true # never fail because bad things happened here 
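The ci/*.yaml files added above are conda environment files. A minimal sketch of how one can be consumed locally, assuming conda is on PATH; it mirrors the `conda env create --file=` call, the `source activate pandas` step, and the in-place build that ci/install_circle.sh and ci/install_travis.sh below perform. The ENV_FILE choice here is illustrative only:

#!/bin/bash
# Sketch only: recreate one of the CI environments locally.
ENV_FILE="ci/appveyor-36.yaml"   # hypothetical choice; CI sets this per job
conda env create -q -n pandas --file="${ENV_FILE}" || exit 1
source activate pandas
# build pandas in-place, as the CI install scripts do
python setup.py build_ext --inplace || exit 1
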
diff --git a/ci/before_script_travis.sh b/ci/before_script_travis.sh new file mode 100755 index 0000000000000..0b3939b1906a2 --- /dev/null +++ b/ci/before_script_travis.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "inside $0" + +if [ "${TRAVIS_OS_NAME}" == "linux" ]; then + sh -e /etc/init.d/xvfb start + sleep 3 +fi + +# Never fail because bad things happened here. +true diff --git a/ci/build_docs.sh b/ci/build_docs.sh index a8488e202dbec..90a666dc34ed7 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -1,42 +1,50 @@ #!/bin/bash +if [ "${TRAVIS_OS_NAME}" != "linux" ]; then + echo "not doing build_docs on non-linux" + exit 0 +fi + cd "$TRAVIS_BUILD_DIR" echo "inside $0" git show --pretty="format:" --name-only HEAD~5.. --first-parent | grep -P "rst|txt|doc" -if [ "$?" != "0" ]; then - echo "Skipping doc build, none were modified" - # nope, skip docs build - exit 0 -fi +# if [ "$?" != "0" ]; then +# echo "Skipping doc build, none were modified" +# # nope, skip docs build +# exit 0 +# fi -if [ x"$DOC_BUILD" != x"" ]; then +if [ "$DOC" ]; then - # we're running network tests, let's build the docs in the meantime echo "Will build docs" - conda install -n pandas sphinx=1.1.3 pygments ipython=2.4 --yes source activate pandas mv "$TRAVIS_BUILD_DIR"/doc /tmp + mv "$TRAVIS_BUILD_DIR/LICENSE" /tmp # included in the docs. cd /tmp/doc - rm /tmp/doc/source/api.rst # no R - rm /tmp/doc/source/r_interface.rst # no R - echo ############################### echo # Log file for the doc build # echo ############################### - echo -e "y\n" | ./make.py --no-api 2>&1 + echo ./make.py + ./make.py + + echo ######################## + echo # Create and send docs # + echo ######################## cd /tmp/doc/build/html git config --global user.email "pandas-docs-bot@localhost.foo" git config --global user.name "pandas-docs-bot" + # create the repo git init + touch README git add README git commit -m "Initial commit" --allow-empty @@ -45,8 +53,22 @@ if [ x"$DOC_BUILD" != x"" ]; then touch .nojekyll git add --all . 
git commit -m "Version" --allow-empty - git remote add origin https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis + + git remote remove origin + git remote add origin "https://${PANDAS_GH_TOKEN}@github.com/pandas-dev/pandas-docs-travis.git" + git fetch origin + git remote -v + git push origin gh-pages -f + + echo "Running doctests" + cd "$TRAVIS_BUILD_DIR" + pytest --doctest-modules \ + pandas/core/reshape/concat.py \ + pandas/core/reshape/pivot.py \ + pandas/core/reshape/reshape.py \ + pandas/core/reshape/tile.py + fi exit 0 diff --git a/ci/check_cache.sh b/ci/check_cache.sh new file mode 100755 index 0000000000000..b83144fc45ef4 --- /dev/null +++ b/ci/check_cache.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# currently not used +# script to make sure that cache is clean +# Travis CI now handles this + +if [ "$TRAVIS_PULL_REQUEST" == "false" ] +then + echo "Not a PR: checking for changes in ci/ from last 2 commits" + git diff HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l) +else + echo "PR: checking for changes in ci/ from last 2 commits" + git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD + git diff PR_HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) +fi + +CACHE_DIR="$HOME/.cache/" +CCACHE_DIR="$HOME/.ccache/" + +if [ $ci_changes -ne 0 ] +then + echo "Files have changed in ci/ deleting all caches" + rm -rf "$CACHE_DIR" + rm -rf "$CCACHE_DIR" +fi diff --git a/ci/check_imports.py b/ci/check_imports.py new file mode 100644 index 0000000000000..3f09290f8c375 --- /dev/null +++ b/ci/check_imports.py @@ -0,0 +1,36 @@ +""" +Check that certain modules are not loaded by `import pandas` +""" +import sys + +blacklist = { + 'bs4', + 'gcsfs', + 'html5lib', + 'ipython', + 'jinja2' + 'lxml', + 'numexpr', + 'openpyxl', + 'py', + 'pytest', + 's3fs', + 'scipy', + 'tables', + 'xlrd', + 'xlsxwriter', + 'xlwt', +} + + +def main(): + import pandas # noqa + + modules = set(x.split('.')[0] for x in sys.modules) + imported = modules & blacklist + if modules & blacklist: + sys.exit("Imported {}".format(imported)) + + +if __name__ == '__main__': + main() diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml new file mode 100644 index 0000000000000..81a48d4edf11c --- /dev/null +++ b/ci/circle-27-compat.yaml @@ -0,0 +1,28 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - bottleneck=1.0.0 + - cython=0.24 + - jinja2=2.8 + - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr + - numpy=1.9.2 + - openpyxl + - psycopg2 + - pytables=3.2.2 + - python-dateutil=2.5.0 + - python=2.7* + - pytz=2013b + - scipy=0.14.0 + - sqlalchemy=0.7.8 + - xlrd=0.9.2 + - xlsxwriter=0.5.2 + - xlwt=0.7.5 + # universal + - pytest + - pytest-xdist + - pip: + - html5lib==1.0b2 + - beautifulsoup4==4.2.1 + - pymysql==0.6.0 diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml new file mode 100644 index 0000000000000..602c414b49bb2 --- /dev/null +++ b/ci/circle-35-ascii.yaml @@ -0,0 +1,13 @@ +name: pandas +channels: + - defaults +dependencies: + - cython + - nomkl + - numpy + - python-dateutil + - python=3.5* + - pytz + # universal + - pytest + - pytest-xdist diff --git a/ci/circle-36-locale.yaml b/ci/circle-36-locale.yaml new file mode 100644 index 0000000000000..cc852c1e2aeeb --- /dev/null +++ b/ci/circle-36-locale.yaml @@ -0,0 +1,33 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython + - html5lib + - ipython + - 
jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml new file mode 100644 index 0000000000000..f44e98e1ee09d --- /dev/null +++ b/ci/circle-36-locale_slow.yaml @@ -0,0 +1,34 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython + - gcsfs + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto diff --git a/ci/cron/go_doc.sh b/ci/cron/go_doc.sh deleted file mode 100755 index 89659577d0e7f..0000000000000 --- a/ci/cron/go_doc.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# This is a one-command cron job for setting up -# a virtualenv-based, linux-based, py2-based environment -# for building the Pandas documentation. -# -# The first run will install all required deps from pypi -# into the venv including monsters like scipy. -# You may want to set it up yourself to speed up the -# process. -# -# This is meant to be run as a cron job under a dedicated -# user account whose HOME directory contains this script. -# a CI directory will be created under it and all files -# stored within it. -# -# The hardcoded dep versions will gradually become obsolete -# You may need to tweak them -# -# @y-p, Jan/2014 - -# disto latex is sometimes finicky. Optionall use -# a local texlive install -export PATH=/mnt/debian/texlive/2013/bin/x86_64-linux:$PATH - -# Having ccache will speed things up -export PATH=/usr/lib64/ccache/:$PATH - -# limit disk usage -ccache -M 200M - -BASEDIR="$HOME/CI" -REPO_URL="https://github.com/pydata/pandas" -REPO_LOC="$BASEDIR/pandas" - -if [ ! -d $BASEDIR ]; then - mkdir -p $BASEDIR - virtualenv $BASEDIR/venv -fi - -source $BASEDIR/venv/bin/activate - -pip install numpy==1.7.2 -pip install cython==0.20.0 -pip install python-dateutil==2.2 -pip install --pre pytz==2013.9 -pip install sphinx==1.1.3 -pip install numexpr==2.2.2 - -pip install matplotlib==1.3.0 -pip install lxml==3.2.5 -pip install beautifulsoup4==4.3.2 -pip install html5lib==0.99 - -# You'll need R as well -pip install rpy2==2.3.9 - -pip install tables==3.0.0 -pip install bottleneck==0.7.0 -pip install ipython==0.13.2 - -# only if you have too -pip install scipy==0.13.2 - -pip install openpyxl==1.6.2 -pip install xlrd==0.9.2 -pip install xlwt==0.7.5 -pip install xlsxwriter==0.5.1 -pip install sqlalchemy==0.8.3 - -if [ ! -d "$REPO_LOC" ]; then - git clone "$REPO_URL" "$REPO_LOC" -fi - -cd "$REPO_LOC" -git reset --hard -git clean -df -git checkout master -git pull origin -make - -source $BASEDIR/venv/bin/activate -export PATH="/usr/lib64/ccache/:$PATH" -pip uninstall pandas -yq -pip install "$REPO_LOC" - -cd "$REPO_LOC"/doc - -python make.py clean -python make.py html -if [ ! $? 
== 0 ]; then - exit 1 -fi -python make.py zip_html -# usually requires manual intervention -# python make.py latex - -# If you have access: -# python make.py upload_dev diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml new file mode 100644 index 0000000000000..5733857b55dd4 --- /dev/null +++ b/ci/environment-dev.yaml @@ -0,0 +1,16 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - Cython + - NumPy + - flake8 + - moto + - pytest>=3.1 + - python-dateutil>=2.5.0 + - python=3 + - pytz + - setuptools>=24.2.0 + - sphinx + - sphinxcontrib-spelling diff --git a/ci/install.ps1 b/ci/install.ps1 new file mode 100644 index 0000000000000..64ec7f81884cd --- /dev/null +++ b/ci/install.ps1 @@ -0,0 +1,92 @@ +# Sample script to install Miniconda under Windows +# Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon +# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ + +$MINICONDA_URL = "http://repo.continuum.io/miniconda/" + + +function DownloadMiniconda ($python_version, $platform_suffix) { + $webclient = New-Object System.Net.WebClient + $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" + $url = $MINICONDA_URL + $filename + + $basedir = $pwd.Path + "\" + $filepath = $basedir + $filename + if (Test-Path $filename) { + Write-Host "Reusing" $filepath + return $filepath + } + + # Download and retry up to 3 times in case of network transient errors. + Write-Host "Downloading" $filename "from" $url + $retry_attempts = 2 + for($i=0; $i -lt $retry_attempts; $i++){ + try { + $webclient.DownloadFile($url, $filepath) + break + } + Catch [Exception]{ + Start-Sleep 1 + } + } + if (Test-Path $filepath) { + Write-Host "File saved at" $filepath + } else { + # Retry once to get the error message if any at the last try + $webclient.DownloadFile($url, $filepath) + } + return $filepath +} + + +function InstallMiniconda ($python_version, $architecture, $python_home) { + Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home + if (Test-Path $python_home) { + Write-Host $python_home "already exists, skipping." + return $false + } + if ($architecture -match "32") { + $platform_suffix = "x86" + } else { + $platform_suffix = "x86_64" + } + + $filepath = DownloadMiniconda $python_version $platform_suffix + Write-Host "Installing" $filepath "to" $python_home + $install_log = $python_home + ".log" + $args = "/S /D=$python_home" + Write-Host $filepath $args + Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru + if (Test-Path $python_home) { + Write-Host "Python $python_version ($architecture) installation complete" + } else { + Write-Host "Failed to install Python in $python_home" + Get-Content -Path $install_log + Exit 1 + } +} + + +function InstallCondaPackages ($python_home, $spec) { + $conda_path = $python_home + "\Scripts\conda.exe" + $args = "install --yes " + $spec + Write-Host ("conda " + $args) + Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru +} + +function UpdateConda ($python_home) { + $conda_path = $python_home + "\Scripts\conda.exe" + Write-Host "Updating conda..." 
+ $args = "update --yes conda" + Write-Host $conda_path $args + Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru +} + + +function main () { + InstallMiniconda "3.5" $env:PYTHON_ARCH $env:CONDA_ROOT + UpdateConda $env:CONDA_ROOT + InstallCondaPackages $env:CONDA_ROOT "conda-build jinja2 anaconda-client" +} + +main diff --git a/ci/install_appveyor.ps1 b/ci/install_appveyor.ps1 deleted file mode 100644 index a022995dc7d58..0000000000000 --- a/ci/install_appveyor.ps1 +++ /dev/null @@ -1,133 +0,0 @@ -# Sample script to install Miniconda under Windows -# Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon -# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ - -$MINICONDA_URL = "http://repo.continuum.io/miniconda/" - - -function DownloadMiniconda ($python_version, $platform_suffix) { - $webclient = New-Object System.Net.WebClient - if ($python_version -match "3.4") { - $filename = "Miniconda3-3.5.5-Windows-" + $platform_suffix + ".exe" - } else { - $filename = "Miniconda-3.5.5-Windows-" + $platform_suffix + ".exe" - } - $url = $MINICONDA_URL + $filename - - $basedir = $pwd.Path + "\" - $filepath = $basedir + $filename - if (Test-Path $filename) { - Write-Host "Reusing" $filepath - return $filepath - } - - # Download and retry up to 3 times in case of network transient errors. - Write-Host "Downloading" $filename "from" $url - $retry_attempts = 2 - for($i=0; $i -lt $retry_attempts; $i++){ - try { - $webclient.DownloadFile($url, $filepath) - break - } - Catch [Exception]{ - Start-Sleep 1 - } - } - if (Test-Path $filepath) { - Write-Host "File saved at" $filepath - } else { - # Retry once to get the error message if any at the last try - $webclient.DownloadFile($url, $filepath) - } - return $filepath -} - -function Start-Executable { - param( - [String] $FilePath, - [String[]] $ArgumentList - ) - $OFS = " " - $process = New-Object System.Diagnostics.Process - $process.StartInfo.FileName = $FilePath - $process.StartInfo.Arguments = $ArgumentList - $process.StartInfo.UseShellExecute = $false - $process.StartInfo.RedirectStandardOutput = $true - if ( $process.Start() ) { - $output = $process.StandardOutput.ReadToEnd() ` - -replace "\r\n$","" - if ( $output ) { - if ( $output.Contains("`r`n") ) { - $output -split "`r`n" - } - elseif ( $output.Contains("`n") ) { - $output -split "`n" - } - else { - $output - } - } - $process.WaitForExit() - & "$Env:SystemRoot\system32\cmd.exe" ` - /c exit $process.ExitCode - } - } - -function InstallMiniconda ($python_version, $architecture, $python_home) { - Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home - if (Test-Path $python_home) { - Write-Host $python_home "already exists, skipping." 
- return $false - } - if ($architecture -match "32") { - $platform_suffix = "x86" - } else { - $platform_suffix = "x86_64" - } - - $filepath = DownloadMiniconda $python_version $platform_suffix - Write-Host "Installing" $filepath "to" $python_home - $install_log = $python_home + ".log" - $args = "/S /D=$python_home" - Write-Host $filepath $args - Start-Process -FilePath $filepath -ArgumentList $args -Wait - if (Test-Path $python_home) { - Write-Host "Python $python_version ($architecture) installation complete" - } else { - Write-Host "Failed to install Python in $python_home" - Get-Content -Path $install_log - Exit 1 - } -} - - -function InstallCondaPackages ($python_home, $spec) { - $conda_path = $python_home + "\Scripts\conda.exe" - $args = "install --yes --quiet " + $spec - Write-Host ("conda " + $args) - Start-Executable -FilePath "$conda_path" -ArgumentList $args -} -function InstallCondaPackagesFromFile ($python_home, $ver, $arch) { - $conda_path = $python_home + "\Scripts\conda.exe" - $args = "install --yes --quiet --file " + $env:APPVEYOR_BUILD_FOLDER + "\ci\requirements-" + $ver + "_" + $arch + ".txt" - Write-Host ("conda " + $args) - Start-Executable -FilePath "$conda_path" -ArgumentList $args -} - -function UpdateConda ($python_home) { - $conda_path = $python_home + "\Scripts\conda.exe" - Write-Host "Updating conda..." - $args = "update --yes conda" - Write-Host $conda_path $args - Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -} - - -function main () { - InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON - UpdateConda $env:PYTHON - InstallCondaPackages $env:PYTHON "pip setuptools nose" - InstallCondaPackagesFromFile $env:PYTHON $env:PYTHON_VERSION $env:PYTHON_ARCH -} - -main \ No newline at end of file diff --git a/ci/install_circle.sh b/ci/install_circle.sh new file mode 100755 index 0000000000000..5ffff84c88488 --- /dev/null +++ b/ci/install_circle.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +home_dir=$(pwd) +echo "[home_dir: $home_dir]" + +echo "[ls -ltr]" +ls -ltr + +echo "[Using clean Miniconda install]" +rm -rf "$MINICONDA_DIR" + +# install miniconda +wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 +bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 + +export PATH="$MINICONDA_DIR/bin:$PATH" + +echo "[update conda]" +conda config --set ssl_verify false || exit 1 +conda config --set always_yes true --set changeps1 false || exit 1 +conda update -q conda + +# add the pandas channel to take priority +# to add extra packages +echo "[add channels]" +conda config --add channels pandas || exit 1 +conda config --remove channels defaults || exit 1 +conda config --add channels defaults || exit 1 + +# Useful for debugging any issues with conda +conda info -a || exit 1 + +# support env variables passed +export ENVS_FILE=".envs" + +# make sure that the .envs file exists. 
it is ok if it is empty +touch $ENVS_FILE + +# assume all command line arguments are environmental variables +for var in "$@" +do + echo "export $var" >> $ENVS_FILE +done + +echo "[environmental variable file]" +cat $ENVS_FILE +source $ENVS_FILE + +# edit the locale override if needed +if [ -n "$LOCALE_OVERRIDE" ]; then + echo "[Adding locale to the first line of pandas/__init__.py]" + rm -f pandas/__init__.pyc + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sed -i "$sedc" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" + head -4 pandas/__init__.py + echo +fi + +# create envbuild deps +echo "[create env]" +time conda env create -q -n pandas --file="${ENV_FILE}" || exit 1 + +source activate pandas + +# remove any installed pandas package +# w/o removing anything else +echo +echo "[removing installed pandas]" +conda remove pandas -y --force +pip uninstall -y pandas + +# build but don't install +echo "[build em]" +time python setup.py build_ext --inplace || exit 1 + +echo +echo "[show environment]" + +conda list diff --git a/ci/install_conda.sh b/ci/install_conda.sh deleted file mode 100755 index 8d99034a86109..0000000000000 --- a/ci/install_conda.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -# There are 2 distinct pieces that get zipped and cached -# - The venv site-packages dir including the installed dependencies -# - The pandas build artifacts, using the build cache support via -# scripts/use_build_cache.py -# -# if the user opted in to use the cache and we're on a whitelisted fork -# - if the server doesn't hold a cached version of venv/pandas build, -# do things the slow way, and put the results on the cache server -# for the next time. -# - if the cache files are available, instal some necessaries via apt -# (no compiling needed), then directly goto script and collect 200$. -# - -function edit_init() -{ - if [ -n "$LOCALE_OVERRIDE" ]; then - echo "Adding locale to the first line of pandas/__init__.py" - rm -f pandas/__init__.pyc - sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" - sed -i "$sedc" pandas/__init__.py - echo "head -4 pandas/__init__.py" - head -4 pandas/__init__.py - echo - fi -} - -edit_init - -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" - -home_dir=$(pwd) -echo "home_dir: [$home_dir]" - -if [ -n "$LOCALE_OVERRIDE" ]; then - # make sure the locale is available - # probably useless, since you would need to relogin - time sudo locale-gen "$LOCALE_OVERRIDE" -fi - -# Need to enable for locale testing. The location of the locale file(s) is -# distro specific. 
For example, on Arch Linux all of the locales are in a -# commented file--/etc/locale.gen--that must be commented in to be used -# whereas Ubuntu looks in /var/lib/locales/supported.d/* and generates locales -# based on what's in the files in that folder -time echo 'it_CH.UTF-8 UTF-8' | sudo tee -a /var/lib/locales/supported.d/it -time sudo locale-gen - - -# install gui for clipboard testing -if [ -n "$CLIPBOARD_GUI" ]; then - echo "Using CLIPBOARD_GUI: $CLIPBOARD_GUI" - [ -n "$python_major_version" ] && py="py" - python_cb_gui_pkg=python${python_major_version}-${py}${CLIPBOARD_GUI} - time sudo apt-get $APT_ARGS install $python_cb_gui_pkg -fi - - -# install a clipboard if $CLIPBOARD is not empty -if [ -n "$CLIPBOARD" ]; then - echo "Using clipboard: $CLIPBOARD" - time sudo apt-get $APT_ARGS install $CLIPBOARD -fi - -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" - -wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 -bash miniconda.sh -b -p $HOME/miniconda || exit 1 - -conda config --set always_yes yes --set changeps1 no || exit 1 -conda update -q conda || exit 1 -conda config --add channels conda-forge || exit 1 -conda config --add channels http://conda.binstar.org/pandas || exit 1 -conda config --set ssl_verify false || exit 1 - -# Useful for debugging any issues with conda -conda info -a || exit 1 - -# build deps -REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.build" -time conda create -n pandas python=$TRAVIS_PYTHON_VERSION nose || exit 1 -time conda install -n pandas --file=${REQ} || exit 1 - -source activate pandas - -# set the compiler cache to work -if [ "$IRON_TOKEN" ]; then - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - ccache=$(which ccache) - echo "ccache: $ccache" - export CC='ccache gcc' -fi - -if [ "$BUILD_TEST" ]; then - - # build testing - pip uninstall --yes cython - pip install cython==0.15.1 - ( python setup.py build_ext --inplace && python setup.py develop ) || true - -else - - # build but don't install - time python setup.py build_ext --inplace || exit 1 - - # we may have run installations - REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.run" - time conda install -n pandas --file=${REQ} || exit 1 - - # we may have additional pip installs - REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.pip" - if [ -e ${REQ} ]; then - pip install -r $REQ - fi - - # remove any installed pandas package - conda remove pandas - - # install our pandas - python setup.py develop || exit 1 - -fi - -true diff --git a/ci/install_db_circle.sh b/ci/install_db_circle.sh new file mode 100755 index 0000000000000..a00f74f009f54 --- /dev/null +++ b/ci/install_db_circle.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo "installing dbs" +mysql -e 'create database pandas_nosetest;' +psql -c 'create database pandas_nosetest;' -U postgres + +echo "done" +exit 0 diff --git a/ci/install_db_travis.sh b/ci/install_db_travis.sh new file mode 100755 index 0000000000000..e4e6d7a5a9b85 --- /dev/null +++ b/ci/install_db_travis.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if [ "${TRAVIS_OS_NAME}" != "linux" ]; then + echo "not using dbs on non-linux" + exit 0 +fi + +echo "installing dbs" +mysql -e 'create database pandas_nosetest;' +psql -c 'create database pandas_nosetest;' -U postgres + +echo "done" +exit 0 diff --git a/ci/install_pydata.sh b/ci/install_pydata.sh deleted file mode 100755 index 667b57897be7e..0000000000000 --- a/ci/install_pydata.sh 
+++ /dev/null @@ -1,159 +0,0 @@ -#!/bin/bash - -# There are 2 distinct pieces that get zipped and cached -# - The venv site-packages dir including the installed dependencies -# - The pandas build artifacts, using the build cache support via -# scripts/use_build_cache.py -# -# if the user opted in to use the cache and we're on a whitelisted fork -# - if the server doesn't hold a cached version of venv/pandas build, -# do things the slow way, and put the results on the cache server -# for the next time. -# - if the cache files are available, instal some necessaries via apt -# (no compiling needed), then directly goto script and collect 200$. -# - -function edit_init() -{ - if [ -n "$LOCALE_OVERRIDE" ]; then - echo "Adding locale to the first line of pandas/__init__.py" - rm -f pandas/__init__.pyc - sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" - sed -i "$sedc" pandas/__init__.py - echo "head -4 pandas/__init__.py" - head -4 pandas/__init__.py - echo - fi -} - -edit_init - -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" - -home_dir=$(pwd) -echo "home_dir: [$home_dir]" - -# known working -# pip==1.5.1 -# setuptools==2.2 -# wheel==0.22 -# nose==1.3.3 - -pip install -I -U pip -pip install -I -U setuptools -pip install wheel==0.22 -#pip install nose==1.3.3 -pip install nose==1.3.4 - -# comment this line to disable the fetching of wheel files -base_url=http://pandas.pydata.org/pandas-build/dev/wheels - -wheel_box=${TRAVIS_PYTHON_VERSION}${JOB_TAG} -PIP_ARGS+=" -I --use-wheel --find-links=$base_url/$wheel_box/ --allow-external --allow-insecure" - -if [ -n "$LOCALE_OVERRIDE" ]; then - # make sure the locale is available - # probably useless, since you would need to relogin - time sudo locale-gen "$LOCALE_OVERRIDE" -fi - -# we need these for numpy -time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran - -if [ -n "$NUMPY_BUILD" ]; then - # building numpy - - cd $home_dir - echo "cloning numpy" - - rm -Rf /tmp/numpy - cd /tmp - - # remove the system installed numpy - pip uninstall numpy -y - - # install cython - pip install --find-links http://wheels.astropy.org/ --find-links http://wheels2.astropy.org/ --use-wheel Cython - - # clone & install - git clone --branch $NUMPY_BUILD https://github.com/numpy/numpy.git numpy - cd numpy - time pip install . - pip uninstall cython -y - - cd $home_dir - numpy_version=$(python -c 'import numpy; print(numpy.__version__)') - echo "[$home_dir] numpy current: $numpy_version" -fi - -# Force virtualenv to accept system_site_packages -rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt - -# build deps -time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.build - -# Need to enable for locale testing. The location of the locale file(s) is -# distro specific. 
For example, on Arch Linux all of the locales are in a -# commented file--/etc/locale.gen--that must be commented in to be used -# whereas Ubuntu looks in /var/lib/locales/supported.d/* and generates locales -# based on what's in the files in that folder -time echo 'it_CH.UTF-8 UTF-8' | sudo tee -a /var/lib/locales/supported.d/it -time sudo locale-gen - - -# install gui for clipboard testing -if [ -n "$CLIPBOARD_GUI" ]; then - echo "Using CLIPBOARD_GUI: $CLIPBOARD_GUI" - [ -n "$python_major_version" ] && py="py" - python_cb_gui_pkg=python${python_major_version}-${py}${CLIPBOARD_GUI} - time sudo apt-get $APT_ARGS install $python_cb_gui_pkg -fi - - -# install a clipboard if $CLIPBOARD is not empty -if [ -n "$CLIPBOARD" ]; then - echo "Using clipboard: $CLIPBOARD" - time sudo apt-get $APT_ARGS install $CLIPBOARD -fi - - -# Optional Deps -if [ -n "$FULL_DEPS" ]; then - echo "Installing FULL_DEPS" - - # need libhdf5 for PyTables - time sudo apt-get $APT_ARGS install libhdf5-serial-dev -fi - - -# set the compiler cache to work -if [ "$IRON_TOKEN" ]; then - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - ccache=$(which ccache) - echo "ccache: $ccache" - export CC='ccache gcc' -fi - -# build pandas -if [ "$BUILD_TEST" ]; then - pip uninstall --yes cython - pip install cython==0.15.1 - ( python setup.py build_ext --inplace ) || true - ( python setup.py develop ) || true -else - python setup.py build_ext --inplace - python setup.py develop -fi - -# install the run libs -time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.run - -# restore cython (if not numpy building) -if [ -z "$NUMPY_BUILD" ]; then - time pip install $PIP_ARGS $(cat ci/requirements-${wheel_box}.txt | grep -i cython) -fi - -true diff --git a/ci/install_test.sh b/ci/install_test.sh deleted file mode 100755 index e01ad7b94a349..0000000000000 --- a/ci/install_test.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -echo "inside $0" - -if [ "$INSTALL_TEST" ]; then - source activate pandas - echo "Starting installation test." - conda uninstall cython || exit 1 - python "$TRAVIS_BUILD_DIR"/setup.py sdist --formats=zip,gztar || exit 1 - pip install "$TRAVIS_BUILD_DIR"/dist/*tar.gz || exit 1 - nosetests --exe -A "$NOSE_ARGS" pandas/tests/test_series.py --with-xunit --xunit-file=/tmp/nosetests_install.xml -else - echo "Skipping installation test." -fi -RET="$?" 
- -exit "$RET" diff --git a/ci/install_travis.sh b/ci/install_travis.sh new file mode 100755 index 0000000000000..fd4a36f86db6c --- /dev/null +++ b/ci/install_travis.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# edit the locale file if needed +function edit_init() +{ + if [ -n "$LOCALE_OVERRIDE" ]; then + echo "[Adding locale to the first line of pandas/__init__.py]" + rm -f pandas/__init__.pyc + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sed -i "$sedc" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" + head -4 pandas/__init__.py + echo + fi +} + +echo +echo "[install_travis]" +edit_init + +home_dir=$(pwd) +echo +echo "[home_dir]: $home_dir" + +# install miniconda +MINICONDA_DIR="$HOME/miniconda3" + +echo +echo "[Using clean Miniconda install]" + +if [ -d "$MINICONDA_DIR" ]; then + rm -rf "$MINICONDA_DIR" +fi + +# install miniconda +if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -q -O miniconda.sh || exit 1 +else + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 +fi +time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 + +echo +echo "[show conda]" +which conda + +echo +echo "[update conda]" +conda config --set ssl_verify false || exit 1 +conda config --set quiet true --set always_yes true --set changeps1 false || exit 1 +conda update -q conda + +# Useful for debugging any issues with conda +conda info -a || exit 1 + +# set the compiler cache to work +echo +if [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then + echo "[Using ccache]" + export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH + gcc=$(which gcc) + echo "[gcc]: $gcc" + ccache=$(which ccache) + echo "[ccache]: $ccache" + export CC='ccache gcc' +elif [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "osx" ]; then + echo "[Install ccache]" + brew install ccache > /dev/null 2>&1 + echo "[Using ccache]" + export PATH=/usr/local/opt/ccache/libexec:$PATH + gcc=$(which gcc) + echo "[gcc]: $gcc" + ccache=$(which ccache) + echo "[ccache]: $ccache" +else + echo "[Not using ccache]" +fi + +echo +echo "[create env]" + +# create our environment +time conda env create -q -n pandas --file="${ENV_FILE}" || exit 1 + +source activate pandas + +# remove any installed pandas package +# w/o removing anything else +echo +echo "[removing installed pandas]" +conda remove pandas -y --force +pip uninstall -y pandas + +echo +echo "[no installed pandas]" +conda list pandas +pip list --format columns |grep pandas + +# build and install +echo "[running setup.py develop]" +python setup.py develop || exit 1 + +echo +echo "[show environment]" +conda list + +echo +echo "[done]" +exit 0 diff --git a/ci/ironcache/get.py b/ci/ironcache/get.py deleted file mode 100644 index a4663472b955c..0000000000000 --- a/ci/ironcache/get.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import re -import os -import time -import json -import base64 -from hashlib import sha1 -from iron_cache import * -import traceback as tb - -key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), - os.environ.get('JOB_NAME','unk')) -print(key) - -if sys.version_info[0] > 2: - key = bytes(key,encoding='utf8') - -key = sha1(key).hexdigest()[:8]+'.' 
- -b = b'' -cache = IronCache() -for i in range(20): - print("getting %s" % key+str(i)) - try: - item = cache.get(cache="travis", key=key+str(i)) - v = item.value - if sys.version_info[0] > 2: - v = bytes(v,encoding='utf8') - b += bytes(base64.b64decode(v)) - except Exception as e: - try: - print(tb.format_exc(e)) - except: - print("exception during exception, oh my") - break - -with open(os.path.join(os.environ.get('HOME',''),"ccache.7z"),'wb') as f: - f.write(b) diff --git a/ci/ironcache/put.py b/ci/ironcache/put.py deleted file mode 100644 index f6aef3a327e87..0000000000000 --- a/ci/ironcache/put.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -import re -import os -import time -import json -import base64 -from hashlib import sha1 -from iron_cache import * - -key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), - os.environ.get('JOB_NAME','unk')) - -key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), - os.environ.get('JOB_NAME','unk')) -print(key) - -if sys.version_info[0] > 2: - key = bytes(key,encoding='utf8') - -key = sha1(key).hexdigest()[:8]+'.' - -os.chdir(os.environ.get('HOME')) - -cache = IronCache() - -i=0 - -for i, fname in enumerate(sorted([x for x in os.listdir('.') if re.match("ccache.\d+$",x)])): - print("Putting %s" % key+str(i)) - with open(fname,"rb") as f: - s= f.read() - value=base64.b64encode(s) - if isinstance(value, bytes): - value = value.decode('ascii') - item = cache.put(cache="travis", key=key+str(i), value=value,options=dict(expires_in=24*60*60)) - -# print("foo") -for i in range(i+1,20): - - try: - item = cache.delete(key+str(i),cache='travis') - print("Deleted %s" % key+str(i)) - except: - break - pass diff --git a/ci/lint.sh b/ci/lint.sh new file mode 100755 index 0000000000000..9bcee55e1344c --- /dev/null +++ b/ci/lint.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +echo "inside $0" + +source activate pandas + +RET=0 + +if [ "$LINT" ]; then + + # pandas/_libs/src is C code, so no need to search there. + echo "Linting *.py" + flake8 pandas --filename=*.py --exclude pandas/_libs/src + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting *.py DONE" + + echo "Linting setup.py" + flake8 setup.py + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting setup.py DONE" + + echo "Linting asv_bench/benchmarks/" + flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/*.py --ignore=F811 + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting asv_bench/benchmarks/*.py DONE" + + echo "Linting scripts/*.py" + flake8 scripts --filename=*.py + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting scripts/*.py DONE" + + echo "Linting doc scripts" + flake8 doc/make.py doc/source/conf.py + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting doc scripts DONE" + + echo "Linting *.pyx" + flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403 + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting *.pyx DONE" + + echo "Linting *.pxi.in" + for path in 'src' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename=*.pxi.in --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 + if [ $? -ne "0" ]; then + RET=1 + fi + done + echo "Linting *.pxi.in DONE" + + echo "Linting *.pxd" + for path in '_libs' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename=*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 + if [ $? 
-ne "0" ]; then + RET=1 + fi + done + echo "Linting *.pxd DONE" + + # readability/casting: Warnings about C casting instead of C++ casting + # runtime/int: Warnings about using C number types instead of C++ ones + # build/include_subdir: Warnings about prefacing included header files with directory + + # We don't lint all C files because we don't want to lint any that are built + # from Cython files nor do we want to lint C files that we didn't modify for + # this particular codebase (e.g. src/headers, src/klib, src/msgpack). However, + # we can lint all header files since they aren't "generated" like C files are. + echo "Linting *.c and *.h" + for path in '*.h' 'period_helper.c' 'datetime' 'parser' 'ujson' + do + echo "linting -> pandas/_libs/src/$path" + cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/_libs/src/$path + if [ $? -ne "0" ]; then + RET=1 + fi + done + echo "Linting *.c and *.h DONE" + + echo "Check for invalid testing" + + # Check for the following code in testing: + # + # np.testing + # np.array_equal + grep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ + + if [ $? = "0" ]; then + RET=1 + fi + + # Check for pytest.warns + grep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ + + if [ $? = "0" ]; then + RET=1 + fi + + # Check for the following code in the extension array base tests + # tm.assert_frame_equal + # tm.assert_series_equal + grep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base + + if [ $? = "0" ]; then + RET=1 + fi + + echo "Check for invalid testing DONE" + + # Check for imports from pandas.core.common instead + # of `import pandas.core.common as com` + echo "Check for non-standard imports" + grep -R --include="*.py*" -E "from pandas.core.common import " pandas + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for non-standard imports DONE" + + echo "Check for use of lists instead of generators in built-in Python functions" + + # Example: Avoid `any([i for i in some_iterator])` in favor of `any(i for i in some_iterator)` + # + # Check the following functions: + # any(), all(), sum(), max(), min(), list(), dict(), set(), frozenset(), tuple(), str.join() + grep -R --include="*.py*" -E "[^_](any|all|sum|max|min|list|dict|set|frozenset|tuple|join)\(\[.* for .* in .*\]\)" pandas + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for use of lists instead of generators in built-in Python functions DONE" + + echo "Check for incorrect sphinx directives" + SPHINX_DIRECTIVES=$(echo \ + "autosummary|contents|currentmodule|deprecated|function|image|"\ + "important|include|ipython|literalinclude|math|module|note|raw|"\ + "seealso|toctree|versionadded|versionchanged|warning" | tr -d "[:space:]") + for path in './pandas' './doc/source' + do + grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. ($SPHINX_DIRECTIVES):[^:]" $path + if [ $? = "0" ]; then + RET=1 + fi + done + echo "Check for incorrect sphinx directives DONE" + + echo "Check for deprecated messages without sphinx directive" + grep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for deprecated messages without sphinx directive DONE" + + echo "Check for old-style classes" + grep -R --include="*.py" -E "class\s\S*[^)]:" pandas scripts + + if [ $? 
= "0" ]; then + RET=1 + fi + echo "Check for old-style classes DONE" + + echo "Check for backticks incorrectly rendering because of missing spaces" + grep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for backticks incorrectly rendering because of missing spaces DONE" + +else + echo "NOT Linting" +fi + +exit $RET diff --git a/ci/prep_ccache.sh b/ci/prep_ccache.sh deleted file mode 100755 index 34e1f2520c422..0000000000000 --- a/ci/prep_ccache.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -if [ "$IRON_TOKEN" ]; then - - home_dir=$(pwd) - - # install the compiler cache - sudo apt-get $APT_ARGS install ccache p7zip-full - # iron_cache, pending py3 fixes upstream - pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache - - python ci/ironcache/get.py - ccache -C - - clear_cache=0 - if [ -f ~/ccache.7z ]; then - echo "Cache retrieved" - clear_cache=1 - cd $HOME - 7za e $HOME/ccache.7z - # ls -l $HOME - cd / - tar xvf $HOME/ccache - rm -rf $HOME/ccache.7z - rm -rf $HOME/ccache - - fi - - # did the last commit change cython files? - cd $home_dir - - retval=$(git diff HEAD~3 --numstat | grep -P "pyx|pxd"|wc -l) - echo "number of cython files changed: $retval" - - if [ $clear_cache -eq 1 ] && [ $retval -eq 0 ] - then - # nope, reuse cython files - echo "Will reuse cached cython file" - touch "$TRAVIS_BUILD_DIR"/pandas/*.c - touch "$TRAVIS_BUILD_DIR"/pandas/src/*.c - touch "$TRAVIS_BUILD_DIR"/pandas/*.cpp - else - echo "Rebuilding cythonized files" - fi -fi - -exit 0 diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh new file mode 100755 index 0000000000000..18d9388327ddc --- /dev/null +++ b/ci/prep_cython_cache.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +ls "$HOME/.cache/" + +PYX_CACHE_DIR="$HOME/.cache/pyxfiles" +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` +pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` + +CACHE_File="$HOME/.cache/cython_files.tar" + +# Clear the cython cache 0 = NO, 1 = YES +clear_cache=0 + +pyx_files=`echo "$pyx_file_list" | wc -l` +pyx_cache_files=`echo "$pyx_cache_file_list" | wc -l` + +if [[ pyx_files -ne pyx_cache_files ]] +then + echo "Different number of pyx files" + clear_cache=1 +fi + +home_dir=$(pwd) + +if [ -f "$CACHE_File" ] && [ -z "$NOCACHE" ] && [ -d "$PYX_CACHE_DIR" ]; then + + echo "Cache available - checking pyx diff" + + for i in ${pyx_file_list} + do + diff=`diff -u $i $PYX_CACHE_DIR${i}` + if [[ $? -eq 2 ]] + then + echo "${i##*/} can't be diffed; probably not in cache" + clear_cache=1 + fi + if [[ ! 
-z $diff ]] + then + echo "${i##*/} has changed:" + echo $diff + clear_cache=1 + fi + done + + if [ "$TRAVIS_PULL_REQUEST" == "false" ] + then + echo "Not a PR" + # Uncomment next 2 lines to turn off cython caching not in a PR + # echo "Non PR cython caching is disabled" + # clear_cache=1 + else + echo "In a PR" + # Uncomment next 2 lines to turn off cython caching in a PR + # echo "PR cython caching is disabled" + # clear_cache=1 + fi + +fi + +if [ $clear_cache -eq 0 ] && [ -z "$NOCACHE" ] +then + # No and nocache is not set + echo "Will reuse cached cython file" + cd / + tar xvmf $CACHE_File + cd $home_dir +else + echo "Rebuilding cythonized files" + echo "No cache = $NOCACHE" + echo "Clear cache (1=YES) = $clear_cache" +fi + + +exit 0 diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 9fb05df64bcea..dd2180f6eeb19 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -30,20 +30,21 @@ def parse_results(filename): i += 1 assert i - 1 == len(skipped) assert i - 1 == len(skipped) - assert len(skipped) == int(root.attrib['skip']) + # assert len(skipped) == int(root.attrib['skip']) return '\n'.join(skipped) def main(args): print('SKIPPED TESTS:') - print(parse_results(args.filename)) + for fn in args.filename: + print(parse_results(fn)) return 0 def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument('filename', help='XUnit file to parse') + parser.add_argument('filename', nargs='+', help='XUnit file to parse') return parser.parse_args() diff --git a/ci/requirements-2.6.build b/ci/requirements-2.6.build deleted file mode 100644 index 85148069a9e6a..0000000000000 --- a/ci/requirements-2.6.build +++ /dev/null @@ -1,4 +0,0 @@ -numpy=1.7.1 -cython=0.19.1 -dateutil=1.5 -pytz=2013b diff --git a/ci/requirements-2.6.pip b/ci/requirements-2.6.pip deleted file mode 100644 index db293045f69ec..0000000000000 --- a/ci/requirements-2.6.pip +++ /dev/null @@ -1,3 +0,0 @@ -blosc -openpyxl -argparse diff --git a/ci/requirements-2.6.run b/ci/requirements-2.6.run deleted file mode 100644 index 5f8a2fde1409f..0000000000000 --- a/ci/requirements-2.6.run +++ /dev/null @@ -1,16 +0,0 @@ -numpy=1.7.1 -dateutil=1.5 -pytz=2013b -scipy=0.11.0 -xlwt=0.7.5 -xlrd=0.9.2 -statsmodels=0.4.3 -bottleneck=0.8.0 -numexpr=2.2.2 -pytables=3.0.0 -html5lib=1.0b2 -beautiful-soup=4.2.0 -psycopg2=2.5.1 -pymysql=0.6.0 -sqlalchemy=0.7.8 -xlsxwriter=0.4.6 diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build deleted file mode 100644 index 6c9965ac0305e..0000000000000 --- a/ci/requirements-2.7.build +++ /dev/null @@ -1,4 +0,0 @@ -dateutil=2.1 -pytz=2013b -numpy=1.9.3 -cython=0.19.1 diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip deleted file mode 100644 index 644457d69b37f..0000000000000 --- a/ci/requirements-2.7.pip +++ /dev/null @@ -1,4 +0,0 @@ -blosc -httplib2 -google-api-python-client == 1.2 -python-gflags == 2.0 diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run deleted file mode 100644 index 10049179912da..0000000000000 --- a/ci/requirements-2.7.run +++ /dev/null @@ -1,21 +0,0 @@ -dateutil=2.1 -pytz=2013b -numpy=1.9.3 -xlwt=0.7.5 -numexpr -pytables -matplotlib -openpyxl=1.6.2 -xlrd=0.9.2 -sqlalchemy=0.9.6 -lxml=3.2.1 -scipy -xlsxwriter=0.4.6 -boto=2.36.0 -bottleneck -psycopg2=2.5.2 -patsy -pymysql=0.6.3 -html5lib=1.0b2 -beautiful-soup=4.2.1 -statsmodels diff --git a/ci/requirements-2.7_BUILD_TEST.build b/ci/requirements-2.7_BUILD_TEST.build deleted file mode 100644 index faf1e3559f7f1..0000000000000 --- a/ci/requirements-2.7_BUILD_TEST.build +++ 
/dev/null @@ -1,4 +0,0 @@ -dateutil -pytz -numpy -cython diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build deleted file mode 100644 index ada6686f599ca..0000000000000 --- a/ci/requirements-2.7_LOCALE.build +++ /dev/null @@ -1,4 +0,0 @@ -python-dateutil -pytz=2013b -numpy=1.7.1 -cython=0.19.1 diff --git a/ci/requirements-2.7_LOCALE.pip b/ci/requirements-2.7_LOCALE.pip deleted file mode 100644 index cf8e6b8b3d3a6..0000000000000 --- a/ci/requirements-2.7_LOCALE.pip +++ /dev/null @@ -1 +0,0 @@ -blosc diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run deleted file mode 100644 index 9bb37ee10f8db..0000000000000 --- a/ci/requirements-2.7_LOCALE.run +++ /dev/null @@ -1,17 +0,0 @@ -python-dateutil -pytz=2013b -numpy=1.7.1 -xlwt=0.7.5 -openpyxl=1.6.2 -xlsxwriter=0.4.6 -xlrd=0.9.2 -bottleneck=0.8.0 -matplotlib=1.2.1 -patsy=0.1.0 -sqlalchemy=0.8.1 -html5lib=1.0b2 -lxml=3.2.1 -scipy=0.11.0 -beautiful-soup=4.2.1 -statsmodels=0.4.3 -bigquery=2.0.17 diff --git a/ci/requirements-2.7_NUMPY_DEV_master.build b/ci/requirements-2.7_NUMPY_DEV_master.build deleted file mode 100644 index 7d1d11daf9eeb..0000000000000 --- a/ci/requirements-2.7_NUMPY_DEV_master.build +++ /dev/null @@ -1,3 +0,0 @@ -python-dateutil -pytz -cython==0.19.1 diff --git a/ci/requirements-2.7_SLOW.build b/ci/requirements-2.7_SLOW.build deleted file mode 100644 index 664e8b418def7..0000000000000 --- a/ci/requirements-2.7_SLOW.build +++ /dev/null @@ -1,4 +0,0 @@ -python-dateutil -pytz -numpy=1.8.2 -cython diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run deleted file mode 100644 index f02a7cb8a309a..0000000000000 --- a/ci/requirements-2.7_SLOW.run +++ /dev/null @@ -1,21 +0,0 @@ -python-dateutil -pytz -numpy=1.8.2 -matplotlib=1.3.1 -scipy -patsy -statsmodels -xlwt -openpyxl -xlsxwriter -xlrd -numexpr -pytables -sqlalchemy -lxml -boto -bottleneck -psycopg2 -pymysql -html5lib -beautiful-soup diff --git a/ci/requirements-3.3.build b/ci/requirements-3.3.build deleted file mode 100644 index ada6686f599ca..0000000000000 --- a/ci/requirements-3.3.build +++ /dev/null @@ -1,4 +0,0 @@ -python-dateutil -pytz=2013b -numpy=1.7.1 -cython=0.19.1 diff --git a/ci/requirements-3.3.pip b/ci/requirements-3.3.pip deleted file mode 100644 index 7e172dc039087..0000000000000 --- a/ci/requirements-3.3.pip +++ /dev/null @@ -1,2 +0,0 @@ -blosc -openpyxl diff --git a/ci/requirements-3.3.run b/ci/requirements-3.3.run deleted file mode 100644 index 0256802a69eba..0000000000000 --- a/ci/requirements-3.3.run +++ /dev/null @@ -1,16 +0,0 @@ -python-dateutil -pytz=2013b -numpy=1.7.1 -xlsxwriter=0.4.6 -xlrd=0.9.2 -xlwt -html5lib=1.0b2 -numexpr -pytables -bottleneck=0.8.0 -matplotlib -patsy -lxml=3.2.1 -scipy -beautiful-soup=4.2.1 -statsmodels diff --git a/ci/requirements-3.4.build b/ci/requirements-3.4.build deleted file mode 100644 index 6fdffd41bd4c4..0000000000000 --- a/ci/requirements-3.4.build +++ /dev/null @@ -1,4 +0,0 @@ -python-dateutil -pytz -numpy=1.8.1 -cython diff --git a/ci/requirements-3.4.pip b/ci/requirements-3.4.pip deleted file mode 100644 index 47a049aac7632..0000000000000 --- a/ci/requirements-3.4.pip +++ /dev/null @@ -1,3 +0,0 @@ -blosc -httplib2 -google-api-python-client diff --git a/ci/requirements-3.4.run b/ci/requirements-3.4.run deleted file mode 100644 index 45d082022713e..0000000000000 --- a/ci/requirements-3.4.run +++ /dev/null @@ -1,18 +0,0 @@ -python-dateutil -pytz -numpy=1.8.1 -openpyxl -xlsxwriter -xlrd -xlwt -html5lib -patsy -beautiful-soup -scipy -numexpr -pytables 
-lxml -sqlalchemy -bottleneck -pymysql=0.6.3 -psycopg2 diff --git a/ci/requirements-3.4_SLOW.build b/ci/requirements-3.4_SLOW.build deleted file mode 100644 index de36b1afb9fa4..0000000000000 --- a/ci/requirements-3.4_SLOW.build +++ /dev/null @@ -1,4 +0,0 @@ -python-dateutil -pytz -numpy=1.9.3 -cython diff --git a/ci/requirements-3.4_SLOW.run b/ci/requirements-3.4_SLOW.run deleted file mode 100644 index 1eca130ecd96a..0000000000000 --- a/ci/requirements-3.4_SLOW.run +++ /dev/null @@ -1,20 +0,0 @@ -python-dateutil -pytz -numpy=1.9.3 -openpyxl -xlsxwriter -xlrd -xlwt -html5lib -patsy -beautiful-soup -scipy -numexpr -pytables -matplotlib -lxml -sqlalchemy -bottleneck -pymysql -psycopg2 -statsmodels diff --git a/ci/requirements-3.5.build b/ci/requirements-3.5.build deleted file mode 100644 index de36b1afb9fa4..0000000000000 --- a/ci/requirements-3.5.build +++ /dev/null @@ -1,4 +0,0 @@ -python-dateutil -pytz -numpy=1.9.3 -cython diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run deleted file mode 100644 index 91938675280d9..0000000000000 --- a/ci/requirements-3.5.run +++ /dev/null @@ -1,23 +0,0 @@ -python-dateutil -pytz -numpy=1.9.3 -openpyxl -xlsxwriter -xlrd -xlwt -patsy -scipy -numexpr -pytables -html5lib -lxml -matplotlib - -# currently causing some warnings -#sqlalchemy -#pymysql -#psycopg2 - -# not available from conda -#beautiful-soup -#bottleneck diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt new file mode 100644 index 0000000000000..9e4e8e99b5205 --- /dev/null +++ b/ci/requirements-optional-conda.txt @@ -0,0 +1,28 @@ +beautifulsoup4>=4.2.1 +blosc +bottleneck +fastparquet +feather-format +gcsfs +html5lib +ipython>=5.6.0 +ipykernel +jinja2 +lxml +matplotlib +nbsphinx +numexpr +openpyxl +pyarrow +pymysql +pytables +pytest-cov +pytest-xdist +s3fs +scipy +seaborn +sqlalchemy +xarray +xlrd +xlsxwriter +xlwt diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt new file mode 100644 index 0000000000000..3cce3f5339883 --- /dev/null +++ b/ci/requirements-optional-pip.txt @@ -0,0 +1,30 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directly +beautifulsoup4>=4.2.1 +blosc +bottleneck +fastparquet +feather-format +gcsfs +html5lib +ipython>=5.6.0 +ipykernel +jinja2 +lxml +matplotlib +nbsphinx +numexpr +openpyxl +pyarrow +pymysql +tables +pytest-cov +pytest-xdist +s3fs +scipy +seaborn +sqlalchemy +xarray +xlrd +xlsxwriter +xlwt \ No newline at end of file diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt deleted file mode 100644 index 6a0b695c5de87..0000000000000 --- a/ci/requirements_all.txt +++ /dev/null @@ -1,22 +0,0 @@ -nose -sphinx -ipython -python-dateutil -pytz -openpyxl -xlsxwriter -xlrd -xlwt -html5lib -patsy -beautiful-soup -numpy -cython -scipy -numexpr -pytables -matplotlib -lxml -sqlalchemy -bottleneck -pymysql diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index eac993f1cdf73..83ee30b52071d 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -1,5 +1,12 @@ -python-dateutil +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directly +Cython +NumPy +flake8 +moto +pytest>=3.1 +python-dateutil>=2.5.0 pytz -numpy -cython -nose +setuptools>=24.2.0 +sphinx +sphinxcontrib-spelling \ No newline at end of file diff --git a/ci/run_build_docs.sh b/ci/run_build_docs.sh index c04c815297aa3..2909b9619552e 100755 --- a/ci/run_build_docs.sh +++ b/ci/run_build_docs.sh @@ -2,7 +2,7 @@ echo "inside $0" 
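 # Aside (a note on the change below, not part of the original script):
 # shell redirections apply left to right, so the removed form
 #   build_docs.sh 2>&1 > /tmp/doc.log &
 # pointed stderr at the console and only stdout at the log file, while
 #   build_docs.sh > /tmp/doc.log 2>&1
 # would have sent both to the log; the new form simply streams both
 # stdout and stderr to the CI console in the foreground.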
-"$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 > /tmp/doc.log & +"$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 # wait until subprocesses finish (build_docs.sh) wait diff --git a/ci/run_circle.sh b/ci/run_circle.sh new file mode 100755 index 0000000000000..435985bd42148 --- /dev/null +++ b/ci/run_circle.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +echo "[running tests]" +export PATH="$MINICONDA_DIR/bin:$PATH" + +source activate pandas + +echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" +pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/run_with_env.cmd b/ci/run_with_env.cmd index 3a472bc836c30..848f4608c8627 100644 --- a/ci/run_with_env.cmd +++ b/ci/run_with_env.cmd @@ -1,3 +1,7 @@ +:: EXPECTED ENV VARS: PYTHON_ARCH (either x86 or x64) +:: CONDA_PY (either 27, 33, 35 etc. - only major version is extracted) +:: +:: :: To build extensions for 64 bit Python 3, we need to configure environment :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) @@ -6,7 +10,8 @@ :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) :: -:: 32 bit builds do not require specific environment configurations. +:: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific +:: environment configurations. :: :: Note: this script needs to be run with the /E:ON and /V:ON flags for the :: cmd interpreter, at least for (SDK v7.0) @@ -15,33 +20,76 @@ :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows :: http://stackoverflow.com/a/13751649/163740 :: -:: Author: Olivier Grisel +:: Author: Phil Elson +:: Original Author: Olivier Grisel (https://github.com/ogrisel/python-appveyor-demo) :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ +:: +:: Notes about batch files for Python people: +:: +:: Quotes in values are literally part of the values: +:: SET FOO="bar" +:: FOO is now five characters long: " b a r " +:: If you don't want quotes, don't include them on the right-hand side. +:: +:: The CALL lines at the end of this file look redundant, but if you move them +:: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y +:: case, I don't know why. +:: originally from https://github.com/pelson/Obvious-CI/blob/master/scripts/obvci_appveyor_python_build_env.cmd @ECHO OFF SET COMMAND_TO_RUN=%* SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows -SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" -IF %MAJOR_PYTHON_VERSION% == "2" ( +:: Extract the major and minor versions, and allow for the minor version to be +:: more than 9. This requires the version number to have two dots in it. +SET MAJOR_PYTHON_VERSION=%CONDA_PY:~0,1% + +IF "%CONDA_PY:~2,1%" == "" ( + :: CONDA_PY style, such as 27, 34 etc. + SET MINOR_PYTHON_VERSION=%CONDA_PY:~1,1% +) ELSE ( + IF "%CONDA_PY:~3,1%" == "." ( + SET MINOR_PYTHON_VERSION=%CONDA_PY:~2,1% + ) ELSE ( + SET MINOR_PYTHON_VERSION=%CONDA_PY:~2,2% + ) +) + +:: Based on the Python version, determine what SDK version to use, and whether +:: to set the SDK for 64-bit. 
+IF %MAJOR_PYTHON_VERSION% == 2 ( SET WINDOWS_SDK_VERSION="v7.0" -) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( - SET WINDOWS_SDK_VERSION="v7.1" + SET SET_SDK_64=Y ) ELSE ( - ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" - EXIT 1 + IF %MAJOR_PYTHON_VERSION% == 3 ( + SET WINDOWS_SDK_VERSION="v7.1" + IF %MINOR_PYTHON_VERSION% LEQ 4 ( + SET SET_SDK_64=Y + ) ELSE ( + SET SET_SDK_64=N + ) + ) ELSE ( + ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" + EXIT /B 1 + ) ) IF "%PYTHON_ARCH%"=="64" ( - ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture - SET DISTUTILS_USE_SDK=1 - SET MSSdk=1 - "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% - "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release - ECHO Executing: %COMMAND_TO_RUN% - call %COMMAND_TO_RUN% || EXIT 1 + IF %SET_SDK_64% == Y ( + ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture + SET DISTUTILS_USE_SDK=1 + SET MSSdk=1 + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT /B 1 + ) ELSE ( + ECHO Using default MSVC build environment for 64 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT /B 1 + ) ) ELSE ( ECHO Using default MSVC build environment for 32 bit architecture ECHO Executing: %COMMAND_TO_RUN% - call %COMMAND_TO_RUN% || EXIT 1 + call %COMMAND_TO_RUN% || EXIT /B 1 ) diff --git a/ci/script.sh b/ci/script.sh deleted file mode 100755 index 1126e8249646c..0000000000000 --- a/ci/script.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -echo "inside $0" - -source activate pandas - -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE"; - echo "Setting LC_ALL to $LOCALE_OVERRIDE" - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -if [ "$BUILD_TEST" ]; then - echo "We are not running nosetests as this is simply a build test." -else - echo nosetests --exe -A "$NOSE_ARGS" pandas --doctest-tests --with-xunit --xunit-file=/tmp/nosetests.xml - nosetests --exe -A "$NOSE_ARGS" pandas --doctest-tests --with-xunit --xunit-file=/tmp/nosetests.xml -fi - -RET="$?" 
- -exit "$RET" diff --git a/ci/script_multi.sh b/ci/script_multi.sh new file mode 100755 index 0000000000000..2b2d4d5488b91 --- /dev/null +++ b/ci/script_multi.sh @@ -0,0 +1,46 @@ +#!/bin/bash -e + +echo "[script multi]" + +source activate pandas + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE"; + echo "Setting LC_ALL to $LOCALE_OVERRIDE" + + pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' + python -c "$pycmd" +fi + +# Enforce absent network during testing by faking a proxy +if echo "$TEST_ARGS" | grep -e --skip-network -q; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + +# Workaround for pytest-xdist flaky collection order +# https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +echo PYTHONHASHSEED=$PYTHONHASHSEED + +if [ "$DOC" ]; then + echo "We are not running pytest as this is a doc-build" + +elif [ "$COVERAGE" ]; then + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + +elif [ "$SLOW" ]; then + TEST_ARGS="--only-slow --skip-network" + echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + +else + echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas # TODO: doctest + +fi + +RET="$?" + +exit "$RET" diff --git a/ci/script_single.sh b/ci/script_single.sh new file mode 100755 index 0000000000000..60e2fbb33ee5d --- /dev/null +++ b/ci/script_single.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +echo "[script_single]" + +source activate pandas + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE"; + echo "Setting LC_ALL to $LOCALE_OVERRIDE" + + pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' + python -c "$pycmd" +fi + +if [ "$SLOW" ]; then + TEST_ARGS="--only-slow --skip-network" +fi + +# Enforce absent network during testing by faking a proxy +if echo "$TEST_ARGS" | grep -e --skip-network -q; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + +if [ "$DOC" ]; then + echo "We are not running pytest as this is a doc-build" + +elif [ "$COVERAGE" ]; then + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + +else + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + +fi + +RET="$?" 
+ +exit "$RET" diff --git a/ci/show_circle.sh b/ci/show_circle.sh new file mode 100755 index 0000000000000..bfaa65c1d84f2 --- /dev/null +++ b/ci/show_circle.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +echo "[installed versions]" + +export PATH="$MINICONDA_DIR/bin:$PATH" +source activate pandas + +python -c "import pandas; pandas.show_versions();" diff --git a/ci/speedpack/Vagrantfile b/ci/speedpack/Vagrantfile deleted file mode 100644 index ec939b7c0a937..0000000000000 --- a/ci/speedpack/Vagrantfile +++ /dev/null @@ -1,22 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : -Vagrant.configure("2") do |config| - config.vm.box = "precise64" - config.vm.box_url = "http://files.vagrantup.com/precise64.box" - -# config.vbguest.auto_update = true -# config.vbguest.no_remote = true - - config.vm.synced_folder File.expand_path("..", Dir.pwd), "/reqf" - config.vm.synced_folder "wheelhouse", "/wheelhouse" - - config.vm.provider :virtualbox do |vb| - vb.customize ["modifyvm", :id, "--cpus", "4"] - vb.customize ["modifyvm", :id, "--memory", "2048"] - vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] - vb.customize ["modifyvm", :id, "--natdnsproxy1", "on"] - end - - config.vm.provision :shell, :path => "build.sh" - -end diff --git a/ci/speedpack/build.sh b/ci/speedpack/build.sh deleted file mode 100755 index 330d8984ea7b7..0000000000000 --- a/ci/speedpack/build.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# This script is meant to run on a mint precise64 VM. -# The generated wheel files should be compatible -# with travis-ci as of 07/2013. -# -# Runtime can be up to an hour or more. - -echo "Building wheels..." - -# print a trace for everything; RTFM -set -x - -# install and update some basics -apt-get update -apt-get install python-software-properties git -y -apt-add-repository ppa:fkrull/deadsnakes -y -apt-get update - -# install some deps and virtualenv -apt-get install python-pip libfreetype6-dev libpng12-dev libhdf5-serial-dev \ - g++ libatlas-base-dev gfortran libreadline-dev zlib1g-dev flex bison \ - libxml2-dev libxslt-dev libssl-dev -y -pip install virtualenv -apt-get build-dep python-lxml -y - -# install sql servers -apt-get install postgresql-client libpq-dev -y - -export PYTHONIOENCODING='utf-8' -export VIRTUALENV_DISTRIBUTE=0 - -function create_fake_pandas() { - local site_pkg_dir="$1" - rm -rf $site_pkg_dir/pandas - mkdir $site_pkg_dir/pandas - touch $site_pkg_dir/pandas/__init__.py - echo "version = '0.10.0-phony'" > $site_pkg_dir/pandas/version.py -} - - -function get_site_pkgs_dir() { - python$1 -c 'import distutils; print(distutils.sysconfig.get_python_lib())' -} - - -function create_wheel() { - local pip_args="$1" - local wheelhouse="$2" - local n="$3" - local pyver="$4" - - local site_pkgs_dir="$(get_site_pkgs_dir $pyver)" - - - if [[ "$n" == *statsmodels* ]]; then - create_fake_pandas $site_pkgs_dir && \ - pip wheel $pip_args --wheel-dir=$wheelhouse $n && \ - pip install $pip_args --no-index $n && \ - rm -Rf $site_pkgs_dir - else - pip wheel $pip_args --wheel-dir=$wheelhouse $n - pip install $pip_args --no-index $n - fi -} - - -function generate_wheels() { - # get the requirements file - local reqfile="$1" - - # get the python version - local TAG=$(echo $reqfile | grep -Po "(\d\.?[\d\-](_\w+)?)") - - # base dir for wheel dirs - local WHEELSTREET=/wheelhouse - local WHEELHOUSE="$WHEELSTREET/$TAG" - - local PY_VER="${TAG:0:3}" - local PY_MAJOR="${PY_VER:0:1}" - local PIP_ARGS="--use-wheel --find-links=$WHEELHOUSE --download-cache /tmp" - - # install the python version 
if not installed - apt-get install python$PY_VER python$PY_VER-dev -y - - # create a new virtualenv - rm -Rf /tmp/venv - virtualenv -p python$PY_VER /tmp/venv - source /tmp/venv/bin/activate - - # install pip setuptools - pip install -I --download-cache /tmp 'git+https://github.com/pypa/pip@42102e9d#egg=pip' - pip install -I -U --download-cache /tmp setuptools - pip install -I --download-cache /tmp wheel - - # make the dir if it doesn't exist - mkdir -p $WHEELHOUSE - - # put the requirements file in the wheelhouse - cp $reqfile $WHEELHOUSE - - # install and build the wheels - cat $reqfile | while read N; do - create_wheel "$PIP_ARGS" "$WHEELHOUSE" "$N" "$PY_VER" - done -} - - -# generate a single wheel version -# generate_wheels "/reqf/requirements-2.7.txt" -# -# if vagrant is already up -# run as vagrant provision - -for reqfile in $(ls -1 /reqf/requirements-*.*); do - generate_wheels "$reqfile" -done diff --git a/ci/speedpack/nginx/nginx.conf.template b/ci/speedpack/nginx/nginx.conf.template deleted file mode 100644 index e2cfeaf053d08..0000000000000 --- a/ci/speedpack/nginx/nginx.conf.template +++ /dev/null @@ -1,48 +0,0 @@ -#user nobody; -worker_processes 1; - -#error_log logs/error.log; -#error_log logs/error.log notice; -#error_log logs/error.log info; - -#pid logs/nginx.pid; - - -events { - worker_connections 1024; -} - - -http { - include mime.types; - default_type application/octet-stream; - - #log_format main '$remote_addr - $remote_user [$time_local] "$request" ' - # '$status $body_bytes_sent "$http_referer" ' - # '"$http_user_agent" "$http_x_forwarded_for"'; - - #access_log logs/access.log on; - - sendfile on; - #tcp_nopush on; - - #keepalive_timeout 0; - keepalive_timeout 65; - - #gzip on; - - server { - listen $OPENSHIFT_IP:$OPENSHIFT_PORT; - - access_log access.log ; - sendfile on; - - location / { - root ../../app-root/data/store/; - autoindex on; - } - - - } - -} diff --git a/ci/submit_ccache.sh b/ci/submit_ccache.sh deleted file mode 100755 index da421489230dd..0000000000000 --- a/ci/submit_ccache.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -home_dir=$(pwd) -ccache -s - -MISSES=$(ccache -s | grep "cache miss" | grep -Po "\d+") -echo "MISSES: $MISSES" - -if [ x"$MISSES" == x"0" ]; then - echo "No cache misses detected, skipping upload" - exit 0 -fi - -if [ "$IRON_TOKEN" ]; then - - # install the compiler cache - sudo apt-get $APT_ARGS install ccache p7zip-full - # iron_cache, pending py3 fixes upstream - pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache - - rm -rf $HOME/ccache.7z - - tar cf - $HOME/.ccache \ - "$TRAVIS_BUILD_DIR"/pandas/{index,algos,lib,tslib,parser,hashtable}.c \ - "$TRAVIS_BUILD_DIR"/pandas/src/{sparse,testing}.c \ - "$TRAVIS_BUILD_DIR"/pandas/msgpack.cpp \ - | 7za a -si $HOME/ccache.7z - - split -b 500000 -d $HOME/ccache.7z $HOME/ccache. 
- - python ci/ironcache/put.py -fi; - -exit 0 diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh new file mode 100755 index 0000000000000..b87acef0ba11c --- /dev/null +++ b/ci/submit_cython_cache.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +CACHE_File="$HOME/.cache/cython_files.tar" +PYX_CACHE_DIR="$HOME/.cache/pyxfiles" +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` + +rm -rf $CACHE_File +rm -rf $PYX_CACHE_DIR + +home_dir=$(pwd) + +mkdir -p $PYX_CACHE_DIR +rsync -Rv $pyx_file_list $PYX_CACHE_DIR + +echo "pyx files:" +echo $pyx_file_list + +tar cf ${CACHE_File} --files-from /dev/null + +for i in ${pyx_file_list} +do + f=${i%.pyx} + ls $f.{c,cpp} | tar rf ${CACHE_File} -T - +done + +echo "Cython files in cache tar:" +tar tvf ${CACHE_File} + +exit 0 diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml new file mode 100644 index 0000000000000..1312c1296d46a --- /dev/null +++ b/ci/travis-27-locale.yaml @@ -0,0 +1,27 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - bottleneck=1.0.0 + - cython=0.24 + - lxml + - matplotlib=1.4.3 + - numpy=1.9.2 + - openpyxl=2.4.0 + - python-dateutil + - python-blosc + - python=2.7 + - pytz + - pytz=2013b + - scipy + - sqlalchemy=0.8.1 + - xlrd=0.9.2 + - xlsxwriter=0.5.2 + - xlwt=0.7.5 + # universal + - pytest + - pytest-xdist + - pip: + - html5lib==1.0b2 + - beautifulsoup4==4.2.1 diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml new file mode 100644 index 0000000000000..482b888b88062 --- /dev/null +++ b/ci/travis-27.yaml @@ -0,0 +1,50 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - bottleneck + - cython=0.24 + - fastparquet + - feather-format + - flake8=3.4.1 + - gcsfs + - html5lib + - ipython + - jemalloc=4.5.0.post + - jinja2=2.8 + - lxml + - matplotlib + - mock + - nomkl + - numexpr + - numpy=1.13* + - openpyxl=2.4.0 + - patsy + - psycopg2 + - py + - pyarrow=0.4.1 + - PyCrypto + - pymysql=0.6.3 + - pytables + - python-blosc + - python-dateutil=2.5.0 + - python=2.7* + - pytz=2013b + - s3fs + - scipy + - sqlalchemy=0.9.6 + - xarray=0.8.0 + - xlrd=0.9.2 + - xlsxwriter=0.5.2 + - xlwt=0.7.5 + # universal + - pytest + - pytest-xdist + - moto + - pip: + - backports.lzma + - cpplint + - pandas-gbq + - pathlib diff --git a/ci/travis-35-osx.yaml b/ci/travis-35-osx.yaml new file mode 100644 index 0000000000000..e74abac4c9775 --- /dev/null +++ b/ci/travis-35-osx.yaml @@ -0,0 +1,27 @@ +name: pandas +channels: + - defaults +dependencies: + - beautifulsoup4 + - bottleneck + - cython + - html5lib + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy=1.10.4 + - openpyxl + - pytables + - python=3.5* + - pytz + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - pip: + - python-dateutil==2.5.3 diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml new file mode 100644 index 0000000000000..c22dddbe0ba3f --- /dev/null +++ b/ci/travis-36-doc.yaml @@ -0,0 +1,45 @@ +name: pandas +channels: + - defaults + - conda-forge + - r +dependencies: + - beautifulsoup4 + - bottleneck + - cython + - fastparquet + - feather-format + - html5lib + - ipykernel + - ipython + - ipywidgets + - lxml + - matplotlib + - nbconvert + - nbformat + - nbsphinx + - notebook + - numexpr + - numpy=1.13* + - openpyxl + - pandoc + - pyqt + - pytables + - python-dateutil + - python-snappy + - python=3.6* + - pytz + - r + - rpy2 + - scipy + - seaborn + - sphinx + - sqlalchemy + - statsmodels + - xarray + - xlrd + - xlsxwriter + 
- xlwt + # universal + - pytest + - pytest-xdist diff --git a/ci/travis-36-numpydev.yaml b/ci/travis-36-numpydev.yaml new file mode 100644 index 0000000000000..455d65feb4242 --- /dev/null +++ b/ci/travis-36-numpydev.yaml @@ -0,0 +1,16 @@ +name: pandas +channels: + - defaults +dependencies: + - python=3.6* + - pytz + - Cython + # universal + - pytest + - pytest-xdist + - pip: + - "git+git://github.com/dateutil/dateutil.git" + - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" + - "--pre" + - "numpy" + - "scipy" diff --git a/ci/travis-36-slow.yaml b/ci/travis-36-slow.yaml new file mode 100644 index 0000000000000..6c475dc48723c --- /dev/null +++ b/ci/travis-36-slow.yaml @@ -0,0 +1,30 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython + - html5lib + - lxml + - matplotlib + - numexpr + - numpy + - openpyxl + - patsy + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml new file mode 100644 index 0000000000000..ff4f1a4a86f99 --- /dev/null +++ b/ci/travis-36.yaml @@ -0,0 +1,48 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython + - dask + - fastparquet + - feather-format + - gcsfs + - geopandas + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl + - psycopg2 + - pyarrow + - pymysql + - pytables + - python-snappy + - python=3.6* + - pytz + - s3fs + - scikit-learn + - scipy + - seaborn + - sqlalchemy + - statsmodels + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - pytest-cov + - moto + - pip: + - brotlipy + - coverage + - pandas-datareader + - python-dateutil diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml new file mode 100644 index 0000000000000..8b255c9e6ec72 --- /dev/null +++ b/ci/travis-37.yaml @@ -0,0 +1,14 @@ +name: pandas +channels: + - defaults + - conda-forge + - c3i_test +dependencies: + - python=3.7 + - cython + - numpy + - python-dateutil + - nomkl + - pytz + - pytest + - pytest-xdist diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh new file mode 100755 index 0000000000000..e404ca73a405e --- /dev/null +++ b/ci/travis_encrypt_gbq.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +GBQ_JSON_FILE=$1 + +if [[ $# -ne 1 ]]; then + echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ + "" + exit 1 +fi + +if [[ $GBQ_JSON_FILE != *.json ]]; then + echo "ERROR: Expected *.json file" + exit 1 +fi + +if [[ ! -f $GBQ_JSON_FILE ]]; then + echo "ERROR: File $GBQ_JSON_FILE does not exist" + exit 1 +fi + +echo "Encrypting $GBQ_JSON_FILE..." +read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \ +travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); + +echo "Adding your secure key to travis_gbq_config.txt ..." +echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\ +> travis_gbq_config.txt + +echo "Done. 
Removing file $GBQ_JSON_FILE" +rm $GBQ_JSON_FILE + +echo -e "Created encrypted credentials file travis_gbq.json.enc.\n"\ + "NOTE: Do NOT commit the *.json file containing your unencrypted" \ + "private key" diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc new file mode 100644 index 0000000000000..c2a33bbd6f263 Binary files /dev/null and b/ci/travis_gbq.json.enc differ diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt new file mode 100644 index 0000000000000..0b28cdedbd0d7 --- /dev/null +++ b/ci/travis_gbq_config.txt @@ -0,0 +1,2 @@ +TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv +TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh new file mode 100755 index 0000000000000..9967d40e49f0a --- /dev/null +++ b/ci/travis_process_gbq_encryption.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source ci/travis_gbq_config.txt + +if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then + echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json; +elif [[ -n ${!TRAVIS_IV_ENV} ]]; then + openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ + -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; + export GBQ_PROJECT_ID='pandas-travis'; + echo 'Successfully decrypted gbq credentials' +fi + diff --git a/ci/upload_coverage.sh b/ci/upload_coverage.sh new file mode 100755 index 0000000000000..a7ef2fa908079 --- /dev/null +++ b/ci/upload_coverage.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +if [ -z "$COVERAGE" ]; then + echo "coverage is not selected for this build" + exit 0 +fi + +source activate pandas + +echo "uploading coverage" +bash <(curl -s https://codecov.io/bash) -Z -c -F single -f /tmp/cov-single.xml +bash <(curl -s https://codecov.io/bash) -Z -c -F multiple -f /tmp/cov-multiple.xml diff --git a/circle.yml b/circle.yml new file mode 100644 index 0000000000000..66415defba6fe --- /dev/null +++ b/circle.yml @@ -0,0 +1,38 @@ +machine: + environment: + # these are globally set + MINICONDA_DIR: /home/ubuntu/miniconda3 + + +database: + override: + - ./ci/install_db_circle.sh + + +checkout: + post: + # since circleci does a shallow fetch + # we need to populate our tags + - git fetch --depth=1000 + + +dependencies: + override: + - > + case $CIRCLE_NODE_INDEX in + 0) + sudo apt-get install language-pack-it && ./ci/install_circle.sh JOB="2.7_COMPAT" ENV_FILE="ci/circle-27-compat.yaml" LOCALE_OVERRIDE="it_IT.UTF-8" ;; + 1) + sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE" ENV_FILE="ci/circle-36-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; + 2) + sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE_SLOW" ENV_FILE="ci/circle-36-locale_slow.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; + 3) + ./ci/install_circle.sh JOB="3.5_ASCII" ENV_FILE="ci/circle-35-ascii.yaml" LOCALE_OVERRIDE="C" ;; + esac + - ./ci/show_circle.sh + + +test: + override: + - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: + parallel: true diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000000..512bc2e82a736 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,13 @@ +codecov: + branch: master + +coverage: + status: + project: + default: + enabled: no + target: '82' + patch: + default: + enabled: no + target: '50' diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index e3495bc5bd04a..86bed996c8aab 
100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -1,9 +1,11 @@
 package:
   name: pandas
-  version: {{ environ.get('GIT_DESCRIBE_TAG', '').replace('.dev', 'dev') }}
+  version: {{ environ.get('GIT_DESCRIBE_TAG','').replace('v', '', 1) }}

 build:
   number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }}
+  {% if GIT_DESCRIBE_NUMBER|int == 0 %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_0
+  {% else %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_{{ GIT_BUILD_STR }}{% endif %}

 source:
   git_url: ../
@@ -12,28 +14,21 @@ requirements:
   build:
     - python
     - cython
-    - numpy
-    - libpython  # [py2k and win]
-    - setuptools
+    - numpy 1.11.*
+    - setuptools >=3.3
+    - python-dateutil >=2.5.0
     - pytz
-    - python-dateutil

   run:
     - python
-    - numpy
-    - python-dateutil
+    - numpy >=1.11.*
+    - python-dateutil >=2.5.0
     - pytz

 test:
   imports:
     - pandas

-  #requires:
-  #  - nose
-
-  #commands:
-  #  - nosetests --exe -A "not slow and not network and not disabled" pandas
-
 about:
   home: http://pandas.pydata.org
   license: BSD
diff --git a/doc/README.rst b/doc/README.rst
index 06d95e6b9c44d..12950d323f5d3 100644
--- a/doc/README.rst
+++ b/doc/README.rst
@@ -3,9 +3,11 @@
 Contributing to the documentation
 =================================

-If you're not the developer type, contributing to the documentation is still
-of huge value. You don't even have to be an expert on
-*pandas* to do so! Something as simple as rewriting small passages for clarity
+Whether you are someone who loves writing, teaching, or development,
+contributing to the documentation is of huge value. If you don't see yourself
+as a developer type, please don't stress: we want you to
+contribute. You don't even have to be an expert on *pandas* to do so!
+Something as simple as rewriting small passages for clarity
 as you reference the docs is a simple but effective way to contribute. The
 next person to read that passage will be in your debt!
@@ -40,7 +42,7 @@ Some other important things to know about the docs:
 - The docstrings follow the **Numpy Docstring Standard** which is used widely
   in the Scientific Python community. This standard specifies the format of
   the different sections of the docstring. See `this document
-  `_
+  `_
   for a detailed explanation, or look at some of the existing functions to
   extend it in a similar manner.
@@ -81,7 +83,9 @@ have ``sphinx`` and ``ipython`` installed. `numpydoc
`_ is used to parse the docstrings that
follow the Numpy Docstring Standard (see above), but you don't need to install
this because a local copy of ``numpydoc`` is included in the pandas source
-code.
+code. `nbsphinx `_ is used to convert
+Jupyter notebooks. You will need to install it if you intend to modify any of
+the notebooks included in the documentation.

 Furthermore, it is recommended to have all `optional dependencies
 `_
@@ -155,12 +159,12 @@ Where to start?
---------------

There are a number of issues listed under `Docs
-`_
-and `Good as first PR
-`_
+`_
+and `good first issue
+`_
where you could start out.

-Or maybe you have an idea of you own, by using pandas, looking for something
+Or maybe you have an idea of your own: while using pandas and looking for
 something in the documentation, you thought 'this can be improved'. Let's do
 something about that!
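A side note on the conda.recipe/meta.yaml hunk above: the new `string:` template picks a fixed `_0` build string for tagged releases (zero commits since the tag) and appends the git build string otherwise. Below is a minimal Python sketch of how that Jinja expression renders; the variable values are made-up stand-ins for what conda-build would supply as GIT_DESCRIBE_NUMBER, CONDA_NPY, CONDA_PY and GIT_BUILD_STR:

    import jinja2

    tmpl = jinja2.Template(
        "{% if GIT_DESCRIBE_NUMBER|int == 0 %}"
        "np{{ CONDA_NPY }}py{{ CONDA_PY }}_0"
        "{% else %}"
        "np{{ CONDA_NPY }}py{{ CONDA_PY }}_{{ GIT_BUILD_STR }}"
        "{% endif %}")

    # Tagged release: zero commits since the tag -> fixed "_0" suffix.
    print(tmpl.render(GIT_DESCRIBE_NUMBER="0", CONDA_NPY="111", CONDA_PY="27"))
    # np111py27_0

    # Dev build: the git-describe build string is appended instead.
    print(tmpl.render(GIT_DESCRIBE_NUMBER="17", CONDA_NPY="111",
                      CONDA_PY="27", GIT_BUILD_STR="h1a2b3c4_17"))
    # np111py27_h1a2b3c4_17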
diff --git a/doc/_templates/api_redirect.html b/doc/_templates/api_redirect.html
index 24bdd8363830f..c04a8b58ce544 100644
--- a/doc/_templates/api_redirect.html
+++ b/doc/_templates/api_redirect.html
@@ -1,15 +1,10 @@
-{% set pgn = pagename.split('.') -%}
-{% if pgn[-2][0].isupper() -%}
-    {% set redirect = ["pandas", pgn[-2], pgn[-1], 'html']|join('.') -%}
-{% else -%}
-    {% set redirect = ["pandas", pgn[-1], 'html']|join('.') -%}
-{% endif -%}
+{% set redirect = redirects[pagename.split("/")[-1]] %}
 <html>
 <head>
 <title>This API page has moved</title>
-<meta http-equiv="refresh" content="0;URL={{ redirect }}"/>
+<meta http-equiv="refresh" content="0;URL={{ redirect }}.html"/>
 </head>
 <body>
-<p>This API page has moved <a href="{{ redirect }}">here</a>.</p>
+<p>This API page has moved <a href="{{ redirect }}.html">here</a>.</p>
 </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/doc/_templates/autosummary/accessor.rst b/doc/_templates/autosummary/accessor.rst
index 1401121fb51c6..4ba745cd6fdba 100644
--- a/doc/_templates/autosummary/accessor.rst
+++ b/doc/_templates/autosummary/accessor.rst
@@ -3,4 +3,4 @@

 .. currentmodule:: {{ module.split('.')[0] }}

-.. automethod:: {{ [module.split('.')[1], objname]|join('.') }}
+.. autoaccessor:: {{ (module.split('.')[1:] + [objname]) | join('.') }}
diff --git a/doc/_templates/autosummary/accessor_attribute.rst b/doc/_templates/autosummary/accessor_attribute.rst
index e38a9f22f9d99..b5ad65d6a736f 100644
--- a/doc/_templates/autosummary/accessor_attribute.rst
+++ b/doc/_templates/autosummary/accessor_attribute.rst
@@ -3,4 +3,4 @@

 .. currentmodule:: {{ module.split('.')[0] }}

-.. autoaccessorattribute:: {{ [module.split('.')[1], objname]|join('.') }}
\ No newline at end of file
+.. autoaccessorattribute:: {{ (module.split('.')[1:] + [objname]) | join('.') }}
diff --git a/doc/_templates/autosummary/accessor_callable.rst b/doc/_templates/autosummary/accessor_callable.rst
index 6f45e0fd01e16..7a3301814f5f4 100644
--- a/doc/_templates/autosummary/accessor_callable.rst
+++ b/doc/_templates/autosummary/accessor_callable.rst
@@ -3,4 +3,4 @@

 .. currentmodule:: {{ module.split('.')[0] }}

-.. autoaccessorcallable:: {{ [module.split('.')[1], objname]|join('.') }}.__call__
+.. autoaccessorcallable:: {{ (module.split('.')[1:] + [objname]) | join('.') }}.__call__
diff --git a/doc/_templates/autosummary/accessor_method.rst b/doc/_templates/autosummary/accessor_method.rst
index 8175d8615ceb2..aefbba6ef1bbc 100644
--- a/doc/_templates/autosummary/accessor_method.rst
+++ b/doc/_templates/autosummary/accessor_method.rst
@@ -3,4 +3,4 @@

 .. currentmodule:: {{ module.split('.')[0] }}

-.. autoaccessormethod:: {{ [module.split('.')[1], objname]|join('.') }}
\ No newline at end of file
+.. autoaccessormethod:: {{ (module.split('.')[1:] + [objname]) | join('.') }}
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
new file mode 100644
index 0000000000000..696ed288cf7a6
Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx
new file mode 100644
index 0000000000000..f8b98a6f1f8e4
Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ
diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt
new file mode 100644
index 0000000000000..d32fe5bcd05a6
--- /dev/null
+++ b/doc/cheatsheet/README.txt
@@ -0,0 +1,8 @@
+The Pandas Cheat Sheet was created using Microsoft PowerPoint 2013.
+To create the PDF version, within PowerPoint, simply do a "Save As"
+and pick "PDF" as the format.
+
+This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2].
+
+[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
+[2]: http://www.princetonoptimization.com/
diff --git a/doc/logo/pandas_logo.png b/doc/logo/pandas_logo.png
new file mode 100644
index 0000000000000..065ee4e4856a6
Binary files /dev/null and b/doc/logo/pandas_logo.png differ
diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py
new file mode 100644
index 0000000000000..c3647f0c7d2a8
--- /dev/null
+++ b/doc/logo/pandas_logo.py
@@ -0,0 +1,44 @@
+# script to generate the pandas logo
+
+from matplotlib import pyplot as plt
+from matplotlib import rcParams
+import numpy as np
+
+rcParams['mathtext.fontset'] = 'cm'
+
+
+def fnx():
+    return np.random.randint(5, 50, 10)
+
+
+fig = plt.figure(figsize=(6, 1.25))
+
+ax = fig.add_axes((0.45, 0.1, 0.16, 0.8))
+bar_data = [2.1, -00.8, 1.1, 2.5, -2.1, -0.5, -2.0, 1.5]
+ax.set_ylim(-3, 3)
+ax.set_xticks([])
+ax.set_yticks([])
+ax.bar(np.arange(len(bar_data)), bar_data)
+
+ax = fig.add_axes((0.63, 0.1, 0.16, 0.8))
+for i in range(4):
+    ax.plot(np.random.rand(8))
+ax.set_xticks([])
+ax.set_yticks([])
+
+ax = fig.add_axes((0.63 + 0.18, 0.1, 0.16, 0.8))
+y = np.row_stack((fnx(), fnx(), fnx()))
+x = np.arange(10)
+y1, y2, y3 = fnx(), fnx(), fnx()
+ax.stackplot(x, y1, y2, y3)
+ax.set_xticks([])
+ax.set_yticks([])
+
+plt.figtext(0.05, 0.5, "pandas", size=40)
+
+plt.figtext(
+    0.05, 0.2, r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
+    size=16, color="#5a89a4")
+
+fig.savefig('pandas_logo.svg')
+fig.savefig('pandas_logo.png')
diff --git a/doc/logo/pandas_logo.svg b/doc/logo/pandas_logo.svg
new file mode 100644
index 0000000000000..b165f9635bf51
--- /dev/null
+++ b/doc/logo/pandas_logo.svg
@@ -0,0 +1,879 @@
+[879 lines of SVG/XML vector markup for the pandas logo; the tag contents were lost in extraction and are omitted here]
diff --git a/doc/make.py b/doc/make.py
index 6b424ce2814d5..4d54a2415a194 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -1,381 +1,372 @@
 #!/usr/bin/env python
-
 """
 Python script for building documentation.

 To build the docs you must have all optional dependencies for pandas
 installed. See the installation instructions for a list of these.

-Note: currently latex builds do not work because of table formats that are not
-supported in the latex generation.
-
-2014-01-30: Latex has some issues but 'latex_forced' works ok for 0.13.0-400 or so
-
 Usage
 -----
-python make.py clean
-python make.py html
+    $ python make.py clean
+    $ python make.py html
+    $ python make.py latex
 """
-from __future__ import print_function
-
-import glob
+import importlib
+import sys
 import os
 import shutil
-import sys
-import sphinx
+# import subprocess
 import argparse
+from contextlib import contextmanager
+import webbrowser

 import jinja2

-os.environ['PYTHONPATH'] = '..'
-
-SPHINX_BUILD = 'sphinxbuild'
-
-
-def upload_dev(user='pandas'):
-    'push a copy to the pydata dev directory'
-    if os.system('cd build/html; rsync -avz . 
{0}@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'.format(user)): - raise SystemExit('Upload to Pydata Dev failed') - - -def upload_dev_pdf(user='pandas'): - 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/'.format(user)): - raise SystemExit('PDF upload to Pydata Dev failed') - -def upload_stable(user='pandas'): - 'push a copy to the pydata stable directory' - if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'.format(user)): - raise SystemExit('Upload to stable failed') +DOC_PATH = os.path.dirname(os.path.abspath(__file__)) +SOURCE_PATH = os.path.join(DOC_PATH, 'source') +BUILD_PATH = os.path.join(DOC_PATH, 'build') +BUILD_DIRS = ['doctrees', 'html', 'latex', 'plots', '_static', '_templates'] -def upload_stable_pdf(user='pandas'): - 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/'.format(user)): - raise SystemExit('PDF upload to stable failed') +@contextmanager +def _maybe_exclude_notebooks(): + """Skip building the notebooks if pandoc is not installed. + This assumes that nbsphinx is installed. -def upload_prev(ver, doc_root='./', user='pandas'): - 'push a copy of older release to appropriate version directory' - local_dir = doc_root + 'build/html' - remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver - cmd = 'cd %s; rsync -avz . %s@pandas.pydata.org:%s -essh' - cmd = cmd % (local_dir, user, remote_dir) - print(cmd) - if os.system(cmd): - raise SystemExit( - 'Upload to %s from %s failed' % (remote_dir, local_dir)) + Skip notebook conversion if: + 1. nbconvert isn't installed, or + 2. nbconvert is installed, but pandoc isn't + """ + # TODO move to exclude_pattern + base = os.path.dirname(__file__) + notebooks = [os.path.join(base, 'source', nb) + for nb in ['style.ipynb']] + contents = {} - local_dir = doc_root + 'build/latex' - pdf_cmd = 'cd %s; scp pandas.pdf %s@pandas.pydata.org:%s' - pdf_cmd = pdf_cmd % (local_dir, user, remote_dir) - if os.system(pdf_cmd): - raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) + def _remove_notebooks(): + for nb in notebooks: + with open(nb, 'rt') as f: + contents[nb] = f.read() + os.remove(nb) -def build_pandas(): - os.chdir('..') - os.system('python setup.py clean') - os.system('python setup.py build_ext --inplace') - os.chdir('doc') - -def build_prev(ver): - if os.system('git checkout v%s' % ver) != 1: - os.chdir('..') - os.system('python setup.py clean') - os.system('python setup.py build_ext --inplace') - os.chdir('doc') - os.system('python make.py clean') - os.system('python make.py html') - os.system('python make.py latex') - os.system('git checkout master') - - -def clean(): - if os.path.exists('build'): - shutil.rmtree('build') - - if os.path.exists('source/generated'): - shutil.rmtree('source/generated') - - -def html(): - check_build() - if os.system('sphinx-build -P -b html -d build/doctrees ' - 'source build/html'): - raise SystemExit("Building HTML failed.") try: - # remove stale file - os.system('cd build; rm -f html/pandas.zip;') - except: - pass - -def zip_html(): - try: - print("\nZipping up HTML docs...") - # just in case the wonky build box doesn't have zip - # don't fail this. 
- os.system('cd build; rm -f html/pandas.zip; zip html/pandas.zip -r -q html/* ') - print("\n") - except: - pass - -def latex(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Call the makefile produced by sphinx... - if os.system('make'): - print("Rendering LaTeX failed.") - print("You may still be able to get a usable PDF file by going into 'build/latex'") - print("and executing 'pdflatex pandas.tex' for the requisite number of passes.") - print("Or using the 'latex_forced' target") - raise SystemExit - - os.chdir('../..') - else: - print('latex build has not been tested on windows') - -def latex_forced(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Manually call pdflatex, 3 passes should ensure latex fixes up - # all the required cross-references and such. - os.system('pdflatex -interaction=nonstopmode pandas.tex') - os.system('pdflatex -interaction=nonstopmode pandas.tex') - os.system('pdflatex -interaction=nonstopmode pandas.tex') - raise SystemExit("You should check the file 'build/latex/pandas.pdf' for problems.") - - os.chdir('../..') + import nbconvert + except ImportError: + sys.stderr.write('Warning: nbconvert not installed. ' + 'Skipping notebooks.\n') + _remove_notebooks() else: - print('latex build has not been tested on windows') - + try: + nbconvert.utils.pandoc.get_pandoc_version() + except nbconvert.utils.pandoc.PandocMissing: + sys.stderr.write('Warning: Pandoc is not installed. ' + 'Skipping notebooks.\n') + _remove_notebooks() + + yield + + for nb, content in contents.items(): + with open(nb, 'wt') as f: + f.write(content) + + +class DocBuilder: + """Class to wrap the different commands of this script. + + All public methods of this class can be called as parameters of the + script. + """ + def __init__(self, num_jobs=1, include_api=True, single_doc=None, + verbosity=0): + self.num_jobs = num_jobs + self.include_api = include_api + self.verbosity = verbosity + self.single_doc = None + self.single_doc_type = None + if single_doc is not None: + self._process_single_doc(single_doc) + self.exclude_patterns = self._exclude_patterns + + self._generate_index() + if self.single_doc_type == 'docstring': + self._run_os('sphinx-autogen', '-o', + 'source/generated_single', 'source/index.rst') + + @property + def _exclude_patterns(self): + """Docs source files that will be excluded from building.""" + # TODO move maybe_exclude_notebooks here + if self.single_doc is not None: + rst_files = [f for f in os.listdir(SOURCE_PATH) + if ((f.endswith('.rst') or f.endswith('.ipynb')) + and (f != 'index.rst') + and (f != '{0}.rst'.format(self.single_doc)))] + if self.single_doc_type != 'api': + rst_files += ['generated/*.rst'] + elif not self.include_api: + rst_files = ['api.rst', 'generated/*.rst'] + else: + rst_files = ['generated_single/*.rst'] + + exclude_patterns = ','.join( + '{!r}'.format(i) for i in ['**.ipynb_checkpoints'] + rst_files) + + return exclude_patterns + + def _process_single_doc(self, single_doc): + """Extract self.single_doc (base name) and self.single_doc_type from + passed single_doc kwarg. 
+ + """ + self.include_api = False + + if single_doc == 'api.rst' or single_doc == 'api': + self.single_doc_type = 'api' + self.single_doc = 'api' + elif os.path.exists(os.path.join(SOURCE_PATH, single_doc)): + self.single_doc_type = 'rst' + self.single_doc = os.path.splitext(os.path.basename(single_doc))[0] + elif os.path.exists( + os.path.join(SOURCE_PATH, '{}.rst'.format(single_doc))): + self.single_doc_type = 'rst' + self.single_doc = single_doc + elif single_doc is not None: + try: + obj = pandas # noqa: F821 + for name in single_doc.split('.'): + obj = getattr(obj, name) + except AttributeError: + raise ValueError('Single document not understood, it should ' + 'be a file in doc/source/*.rst (e.g. ' + '"contributing.rst" or a pandas function or ' + 'method (e.g. "pandas.DataFrame.head")') + else: + self.single_doc_type = 'docstring' + if single_doc.startswith('pandas.'): + self.single_doc = single_doc[len('pandas.'):] + else: + self.single_doc = single_doc + + def _copy_generated_docstring(self): + """Copy existing generated (from api.rst) docstring page because + this is more correct in certain cases (where a custom autodoc + template is used). + + """ + fname = os.path.join(SOURCE_PATH, 'generated', + 'pandas.{}.rst'.format(self.single_doc)) + temp_dir = os.path.join(SOURCE_PATH, 'generated_single') -def check_build(): - build_dirs = [ - 'build', 'build/doctrees', 'build/html', - 'build/latex', 'build/plots', 'build/_static', - 'build/_templates'] - for d in build_dirs: try: - os.mkdir(d) + os.makedirs(temp_dir) except OSError: pass + if os.path.exists(fname): + try: + # copying to make sure sphinx always thinks it is new + # and needs to be re-generated (to pick source code changes) + shutil.copy(fname, temp_dir) + except: # noqa + pass + + def _generate_index(self): + """Create index.rst file with the specified sections.""" + if self.single_doc_type == 'docstring': + self._copy_generated_docstring() + + with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: + t = jinja2.Template(f.read()) + + with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: + f.write(t.render(include_api=self.include_api, + single_doc=self.single_doc, + single_doc_type=self.single_doc_type)) + + @staticmethod + def _create_build_structure(): + """Create directories required to build documentation.""" + for dirname in BUILD_DIRS: + try: + os.makedirs(os.path.join(BUILD_PATH, dirname)) + except OSError: + pass + + @staticmethod + def _run_os(*args): + """Execute a command as a OS terminal. + + Parameters + ---------- + *args : list of str + Command and parameters to be executed + + Examples + -------- + >>> DocBuilder()._run_os('python', '--version') + """ + # TODO check_call should be more safe, but it fails with + # exclude patterns, needs investigation + # subprocess.check_call(args, stderr=subprocess.STDOUT) + os.system(' '.join(args)) + + def _sphinx_build(self, kind): + """Call sphinx to build documentation. + + Attribute `num_jobs` from the class is used. 
+ + Parameters + ---------- + kind : {'html', 'latex'} + + Examples + -------- + >>> DocBuilder(num_jobs=4)._sphinx_build('html') + """ + if kind not in ('html', 'latex', 'spelling'): + raise ValueError('kind must be html, latex or ' + 'spelling, not {}'.format(kind)) + + self._run_os('sphinx-build', + '-j{}'.format(self.num_jobs), + '-b{}'.format(kind), + '-{}'.format( + 'v' * self.verbosity) if self.verbosity else '', + '-d{}'.format(os.path.join(BUILD_PATH, 'doctrees')), + '-Dexclude_patterns={}'.format(self.exclude_patterns), + SOURCE_PATH, + os.path.join(BUILD_PATH, kind)) + + def _open_browser(self): + base_url = os.path.join('file://', DOC_PATH, 'build', 'html') + if self.single_doc_type == 'docstring': + url = os.path.join( + base_url, + 'generated_single', 'pandas.{}.html'.format(self.single_doc)) + else: + url = os.path.join(base_url, '{}.html'.format(self.single_doc)) + webbrowser.open(url, new=2) + + def html(self): + """Build HTML documentation.""" + self._create_build_structure() + with _maybe_exclude_notebooks(): + self._sphinx_build('html') + zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + if os.path.exists(zip_fname): + os.remove(zip_fname) + + if self.single_doc is not None: + self._open_browser() + shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'), + ignore_errors=True) + + def latex(self, force=False): + """Build PDF documentation.""" + self._create_build_structure() + if sys.platform == 'win32': + sys.stderr.write('latex build has not been tested on windows\n') + else: + self._sphinx_build('latex') + os.chdir(os.path.join(BUILD_PATH, 'latex')) + if force: + for i in range(3): + self._run_os('pdflatex', + '-interaction=nonstopmode', + 'pandas.tex') + raise SystemExit('You should check the file ' + '"build/latex/pandas.pdf" for problems.') + else: + self._run_os('make') + + def latex_forced(self): + """Build PDF documentation with retries to find missing references.""" + self.latex(force=True) + + @staticmethod + def clean(): + """Clean documentation generated files.""" + shutil.rmtree(BUILD_PATH, ignore_errors=True) + shutil.rmtree(os.path.join(SOURCE_PATH, 'generated'), + ignore_errors=True) + + def zip_html(self): + """Compress HTML documentation into a zip file.""" + zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + if os.path.exists(zip_fname): + os.remove(zip_fname) + dirname = os.path.join(BUILD_PATH, 'html') + fnames = os.listdir(dirname) + os.chdir(dirname) + self._run_os('zip', + zip_fname, + '-r', + '-q', + *fnames) + + def spellcheck(self): + """Spell check the documentation.""" + self._sphinx_build('spelling') + output_location = os.path.join('build', 'spelling', 'output.txt') + with open(output_location) as output: + lines = output.readlines() + if lines: + raise SyntaxError( + 'Found misspelled words.' 
+ ' Check pandas/doc/build/spelling/output.txt' + ' for more details.') -def all(): - # clean() - html() - - -def auto_dev_build(debug=False): - msg = '' - try: - step = 'clean' - clean() - step = 'html' - html() - step = 'upload dev' - upload_dev() - if not debug: - sendmail(step) - - step = 'latex' - latex() - step = 'upload pdf' - upload_dev_pdf() - if not debug: - sendmail(step) - except (Exception, SystemExit) as inst: - msg = str(inst) + '\n' - sendmail(step, '[ERROR] ' + msg) - - -def sendmail(step=None, err_msg=None): - from_name, to_name = _get_config() - - if step is None: - step = '' - - if err_msg is None or '[ERROR]' not in err_msg: - msgstr = 'Daily docs %s completed successfully' % step - subject = "DOC: %s successful" % step - else: - msgstr = err_msg - subject = "DOC: %s failed" % step - - import smtplib - from email.MIMEText import MIMEText - msg = MIMEText(msgstr) - msg['Subject'] = subject - msg['From'] = from_name - msg['To'] = to_name - - server_str, port, login, pwd = _get_credentials() - server = smtplib.SMTP(server_str, port) - server.ehlo() - server.starttls() - server.ehlo() - - server.login(login, pwd) - try: - server.sendmail(from_name, to_name, msg.as_string()) - finally: - server.close() - - -def _get_dir(subdir=None): - import getpass - USERNAME = getpass.getuser() - if sys.platform == 'darwin': - HOME = '/Users/%s' % USERNAME - else: - HOME = '/home/%s' % USERNAME - - if subdir is None: - subdir = '/code/scripts/config' - conf_dir = '%s/%s' % (HOME, subdir) - return conf_dir - - -def _get_credentials(): - tmp_dir = _get_dir() - cred = '%s/credentials' % tmp_dir - with open(cred, 'r') as fh: - server, port, un, domain = fh.read().split(',') - port = int(port) - login = un + '@' + domain + '.com' - - import base64 - with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: - pwd = base64.b64decode(fh.read()) - - return server, port, login, pwd - - -def _get_config(): - tmp_dir = _get_dir() - with open('%s/addresses' % tmp_dir, 'r') as fh: - from_name, to_name = fh.read().split(',') - return from_name, to_name - -funcd = { - 'html': html, - 'zip_html': zip_html, - 'upload_dev': upload_dev, - 'upload_stable': upload_stable, - 'upload_dev_pdf': upload_dev_pdf, - 'upload_stable_pdf': upload_stable_pdf, - 'latex': latex, - 'latex_forced': latex_forced, - 'clean': clean, - 'auto_dev': auto_dev_build, - 'auto_debug': lambda: auto_dev_build(True), - 'build_pandas': build_pandas, - 'all': all, -} - -small_docs = False - -# current_dir = os.getcwd() -# os.chdir(os.path.dirname(os.path.join(current_dir, __file__))) - -import argparse -argparser = argparse.ArgumentParser(description=""" -pandas documentation builder -""".strip()) - -# argparser.add_argument('-arg_name', '--arg_name', -# metavar='label for arg help', -# type=str|etc, -# nargs='N|*|?|+|argparse.REMAINDER', -# required=False, -# #choices='abc', -# help='help string', -# action='store|store_true') - -# args = argparser.parse_args() - -#print args.accumulate(args.integers) - -def generate_index(api=True, single=False, **kwds): - from jinja2 import Template - with open("source/index.rst.template") as f: - t = Template(f.read()) - - with open("source/index.rst","w") as f: - f.write(t.render(api=api,single=single,**kwds)) - -import argparse -argparser = argparse.ArgumentParser(description="pandas documentation builder", - epilog="Targets : %s" % funcd.keys()) - -argparser.add_argument('--no-api', - default=False, - help='Ommit api and autosummary', - action='store_true') -argparser.add_argument('--single', - 
metavar='FILENAME',
-                       type=str,
-                       default=False,
-                       help='filename of section to compile, e.g. "indexing"')
-argparser.add_argument('--user',
-                       type=str,
-                       default=False,
-                       help='Username to connect to the pydata server')

 def main():
-    args, unknown = argparser.parse_known_args()
-    sys.argv = [sys.argv[0]] + unknown
-    if args.single:
-        args.single = os.path.basename(args.single).split(".rst")[0]
-
-    if 'clean' in unknown:
-        args.single=False
-
-    generate_index(api=not args.no_api and not args.single, single=args.single)
-
-    if len(sys.argv) > 2:
-        ftype = sys.argv[1]
-        ver = sys.argv[2]
-
-        if ftype == 'build_previous':
-            build_prev(ver, user=args.user)
-        if ftype == 'upload_previous':
-            upload_prev(ver, user=args.user)
-    elif len(sys.argv) == 2:
-        for arg in sys.argv[1:]:
-            func = funcd.get(arg)
-            if func is None:
-                raise SystemExit('Do not know how to handle %s; valid args are %s' % (
-                    arg, list(funcd.keys())))
-            if args.user:
-                func(user=args.user)
-            else:
-                func()
-    else:
-        small_docs = False
-        all()
-# os.chdir(current_dir)
+    cmds = [method for method in dir(DocBuilder) if not method.startswith('_')]
+
+    argparser = argparse.ArgumentParser(
+        description='pandas documentation builder',
+        epilog='Commands: {}'.format(','.join(cmds)))
+    argparser.add_argument('command',
+                           nargs='?',
+                           default='html',
+                           help='command to run: {}'.format(', '.join(cmds)))
+    argparser.add_argument('--num-jobs',
+                           type=int,
+                           default=1,
+                           help='number of jobs used by sphinx-build')
+    argparser.add_argument('--no-api',
+                           default=False,
+                           help='omit api and autosummary',
+                           action='store_true')
+    argparser.add_argument('--single',
+                           metavar='FILENAME',
+                           type=str,
+                           default=None,
+                           help=('filename of section or method name to '
+                                 'compile, e.g. "indexing", "DataFrame.join"'))
+    argparser.add_argument('--python-path',
+                           type=str,
+                           default=os.path.dirname(DOC_PATH),
+                           help='path')
+    argparser.add_argument('-v', action='count', dest='verbosity', default=0,
+                           help=('increase verbosity (can be repeated), '
+                                 'passed to the sphinx build command'))
+    args = argparser.parse_args()
+
+    if args.command not in cmds:
+        raise ValueError('Unknown command {}. Available options: {}'.format(
+            args.command, ', '.join(cmds)))
+
+    # Below we update both os.environ and sys.path. The former is used by
+    # external libraries (namely Sphinx) to compile this module and resolve
+    # the import of `python_path` correctly. 
The latter is used to resolve + # the import within the module, injecting it into the global namespace + os.environ['PYTHONPATH'] = args.python_path + sys.path.append(args.python_path) + globals()['pandas'] = importlib.import_module('pandas') + + builder = DocBuilder(args.num_jobs, not args.no_api, args.single, + args.verbosity) + getattr(builder, args.command)() + if __name__ == '__main__': - import sys sys.exit(main()) diff --git a/doc/plots/stats/moment_plots.py b/doc/plots/stats/moment_plots.py deleted file mode 100644 index 9e3a902592c6b..0000000000000 --- a/doc/plots/stats/moment_plots.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np - -import matplotlib.pyplot as plt -import pandas.util.testing as t -import pandas.stats.moments as m - - -def test_series(n=1000): - t.N = n - s = t.makeTimeSeries() - return s - - -def plot_timeseries(*args, **kwds): - n = len(args) - - fig, axes = plt.subplots(n, 1, figsize=kwds.get('size', (10, 5)), - sharex=True) - titles = kwds.get('titles', None) - - for k in range(1, n + 1): - ax = axes[k - 1] - ts = args[k - 1] - ax.plot(ts.index, ts.values) - - if titles: - ax.set_title(titles[k - 1]) - - fig.autofmt_xdate() - fig.subplots_adjust(bottom=0.10, top=0.95) diff --git a/doc/plots/stats/moments_ewma.py b/doc/plots/stats/moments_ewma.py deleted file mode 100644 index 3e521ed60bb8f..0000000000000 --- a/doc/plots/stats/moments_ewma.py +++ /dev/null @@ -1,15 +0,0 @@ -import matplotlib.pyplot as plt -import pandas.util.testing as t -import pandas.stats.moments as m - -t.N = 200 -s = t.makeTimeSeries().cumsum() - -plt.figure(figsize=(10, 5)) -plt.plot(s.index, s.values) -plt.plot(s.index, m.ewma(s, 20, min_periods=1).values) -f = plt.gcf() -f.autofmt_xdate() - -plt.show() -plt.close('all') diff --git a/doc/plots/stats/moments_ewmvol.py b/doc/plots/stats/moments_ewmvol.py deleted file mode 100644 index 093f62868fc4e..0000000000000 --- a/doc/plots/stats/moments_ewmvol.py +++ /dev/null @@ -1,23 +0,0 @@ -import matplotlib.pyplot as plt -import pandas.util.testing as t -import pandas.stats.moments as m - -t.N = 500 -ts = t.makeTimeSeries() -ts[::100] = 20 - -s = ts.cumsum() - - -plt.figure(figsize=(10, 5)) -plt.plot(s.index, m.ewmvol(s, span=50, min_periods=1).values, color='b') -plt.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') - -plt.title('Exp-weighted std with shocks') -plt.legend(('Exp-weighted', 'Equal-weighted')) - -f = plt.gcf() -f.autofmt_xdate() - -plt.show() -plt.close('all') diff --git a/doc/plots/stats/moments_expw.py b/doc/plots/stats/moments_expw.py deleted file mode 100644 index 5fff419b3a940..0000000000000 --- a/doc/plots/stats/moments_expw.py +++ /dev/null @@ -1,35 +0,0 @@ -from moment_plots import * - -np.random.seed(1) - -ts = test_series(500) * 10 - -# ts[::100] = 20 - -s = ts.cumsum() - -fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) - -ax0, ax1, ax2 = axes - -ax0.plot(s.index, s.values) -ax0.set_title('time series') - -ax1.plot(s.index, m.ewma(s, span=50, min_periods=1).values, color='b') -ax1.plot(s.index, m.rolling_mean(s, 50, min_periods=1).values, color='r') -ax1.set_title('rolling_mean vs. ewma') - -line1 = ax2.plot( - s.index, m.ewmstd(s, span=50, min_periods=1).values, color='b') -line2 = ax2.plot( - s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') -ax2.set_title('rolling_std vs. 
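The new entry point replaces the old ``funcd`` name-to-function dictionary with attribute dispatch on a ``DocBuilder`` class, and makes the chosen pandas checkout importable both for subprocesses (via ``os.environ['PYTHONPATH']``, which ``sphinx-build`` reads) and for the current process (via ``sys.path``). What follows is a minimal sketch of that pattern, not the patch itself: the ``DocBuilder`` body here is a hypothetical stand-in, and only the dispatch and path-injection logic mirror the code above.

.. code-block:: python

   import argparse
   import os
   import sys


   class DocBuilder(object):
       """Hypothetical stand-in: each public method is one CLI command."""

       def __init__(self, num_jobs=1):
           self.num_jobs = num_jobs

       def html(self):
           print('building html with {} job(s)'.format(self.num_jobs))

       def latex(self):
           print('building latex')


   def main():
       # Public methods of DocBuilder double as the available commands.
       cmds = [m for m in dir(DocBuilder) if not m.startswith('_')]

       parser = argparse.ArgumentParser(description='doc builder sketch')
       parser.add_argument('command', nargs='?', default='html',
                           help='command to run: {}'.format(', '.join(cmds)))
       parser.add_argument('--num-jobs', type=int, default=1)
       parser.add_argument('--python-path', default=os.getcwd())
       args = parser.parse_args()

       if args.command not in cmds:
           raise ValueError('Unknown command {}'.format(args.command))

       # Visible to subprocesses (e.g. sphinx-build) via the environment,
       # and to this process via sys.path.
       os.environ['PYTHONPATH'] = args.python_path
       sys.path.append(args.python_path)

       # Attribute lookup replaces the old name -> function dictionary.
       getattr(DocBuilder(args.num_jobs), args.command)()


   if __name__ == '__main__':
       sys.exit(main())

Invoked as, e.g., ``python make.py html --num-jobs 4``; because the command is validated against ``dir(DocBuilder)``, adding a new build step amounts to adding one public method.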
diff --git a/doc/plots/stats/moment_plots.py b/doc/plots/stats/moment_plots.py
deleted file mode 100644
index 9e3a902592c6b..0000000000000
--- a/doc/plots/stats/moment_plots.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import numpy as np
-
-import matplotlib.pyplot as plt
-import pandas.util.testing as t
-import pandas.stats.moments as m
-
-
-def test_series(n=1000):
-    t.N = n
-    s = t.makeTimeSeries()
-    return s
-
-
-def plot_timeseries(*args, **kwds):
-    n = len(args)
-
-    fig, axes = plt.subplots(n, 1, figsize=kwds.get('size', (10, 5)),
-                             sharex=True)
-    titles = kwds.get('titles', None)
-
-    for k in range(1, n + 1):
-        ax = axes[k - 1]
-        ts = args[k - 1]
-        ax.plot(ts.index, ts.values)
-
-        if titles:
-            ax.set_title(titles[k - 1])
-
-    fig.autofmt_xdate()
-    fig.subplots_adjust(bottom=0.10, top=0.95)
diff --git a/doc/plots/stats/moments_ewma.py b/doc/plots/stats/moments_ewma.py
deleted file mode 100644
index 3e521ed60bb8f..0000000000000
--- a/doc/plots/stats/moments_ewma.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import matplotlib.pyplot as plt
-import pandas.util.testing as t
-import pandas.stats.moments as m
-
-t.N = 200
-s = t.makeTimeSeries().cumsum()
-
-plt.figure(figsize=(10, 5))
-plt.plot(s.index, s.values)
-plt.plot(s.index, m.ewma(s, 20, min_periods=1).values)
-f = plt.gcf()
-f.autofmt_xdate()
-
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_ewmvol.py b/doc/plots/stats/moments_ewmvol.py
deleted file mode 100644
index 093f62868fc4e..0000000000000
--- a/doc/plots/stats/moments_ewmvol.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import matplotlib.pyplot as plt
-import pandas.util.testing as t
-import pandas.stats.moments as m
-
-t.N = 500
-ts = t.makeTimeSeries()
-ts[::100] = 20
-
-s = ts.cumsum()
-
-
-plt.figure(figsize=(10, 5))
-plt.plot(s.index, m.ewmvol(s, span=50, min_periods=1).values, color='b')
-plt.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r')
-
-plt.title('Exp-weighted std with shocks')
-plt.legend(('Exp-weighted', 'Equal-weighted'))
-
-f = plt.gcf()
-f.autofmt_xdate()
-
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_expw.py b/doc/plots/stats/moments_expw.py
deleted file mode 100644
index 5fff419b3a940..0000000000000
--- a/doc/plots/stats/moments_expw.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from moment_plots import *
-
-np.random.seed(1)
-
-ts = test_series(500) * 10
-
-# ts[::100] = 20
-
-s = ts.cumsum()
-
-fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True)
-
-ax0, ax1, ax2 = axes
-
-ax0.plot(s.index, s.values)
-ax0.set_title('time series')
-
-ax1.plot(s.index, m.ewma(s, span=50, min_periods=1).values, color='b')
-ax1.plot(s.index, m.rolling_mean(s, 50, min_periods=1).values, color='r')
-ax1.set_title('rolling_mean vs. ewma')
-
-line1 = ax2.plot(
-    s.index, m.ewmstd(s, span=50, min_periods=1).values, color='b')
-line2 = ax2.plot(
-    s.index, m.rolling_std(s, 50, min_periods=1).values, color='r')
-ax2.set_title('rolling_std vs. ewmstd')
-
-fig.legend((line1, line2),
-           ('Exp-weighted', 'Equal-weighted'),
-           loc='upper right')
-fig.autofmt_xdate()
-fig.subplots_adjust(bottom=0.10, top=0.95)
-
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_rolling.py b/doc/plots/stats/moments_rolling.py
deleted file mode 100644
index 30a6c5f53e20c..0000000000000
--- a/doc/plots/stats/moments_rolling.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from moment_plots import *
-
-ts = test_series()
-s = ts.cumsum()
-
-s[20:50] = np.NaN
-s[120:150] = np.NaN
-plot_timeseries(s,
-                m.rolling_count(s, 50),
-                m.rolling_sum(s, 50, min_periods=10),
-                m.rolling_mean(s, 50, min_periods=10),
-                m.rolling_std(s, 50, min_periods=10),
-                m.rolling_skew(s, 50, min_periods=10),
-                m.rolling_kurt(s, 50, min_periods=10),
-                size=(10, 12),
-                titles=('time series',
-                        'rolling_count',
-                        'rolling_sum',
-                        'rolling_mean',
-                        'rolling_std',
-                        'rolling_skew',
-                        'rolling_kurt'))
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_rolling_binary.py b/doc/plots/stats/moments_rolling_binary.py
deleted file mode 100644
index ab6b7b1c8ff49..0000000000000
--- a/doc/plots/stats/moments_rolling_binary.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from moment_plots import *
-
-np.random.seed(1)
-
-ts = test_series()
-s = ts.cumsum()
-ts2 = test_series()
-s2 = ts2.cumsum()
-
-s[20:50] = np.NaN
-s[120:150] = np.NaN
-fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True)
-
-ax0, ax1, ax2 = axes
-
-ax0.plot(s.index, s.values)
-ax0.plot(s2.index, s2.values)
-ax0.set_title('time series')
-
-ax1.plot(s.index, m.rolling_corr(s, s2, 50, min_periods=1).values)
-ax1.set_title('rolling_corr')
-
-ax2.plot(s.index, m.rolling_cov(s, s2, 50, min_periods=1).values)
-ax2.set_title('rolling_cov')
-
-fig.autofmt_xdate()
-fig.subplots_adjust(bottom=0.10, top=0.95)
-
-plt.show()
-plt.close('all')
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
index 3cf05698dd9d0..fbbe94a72c71e 100644
--- a/doc/source/10min.rst
+++ b/doc/source/10min.rst
@@ -11,10 +11,7 @@
    np.random.seed(123456)
    np.set_printoptions(precision=4, suppress=True)
    import matplotlib
-   try:
-       matplotlib.style.use('ggplot')
-   except AttributeError:
-       pd.options.display.mpl_style = 'default'
+   # matplotlib.style.use('default')
    pd.options.display.max_rows = 15
 
 #### portions of this were borrowed from the
@@ -28,7 +25,7 @@
 ********************
 
 This is a short introduction to pandas, geared mainly for new users.
-You can see more complex recipes in the :ref:`Cookbook`
+You can see more complex recipes in the :ref:`Cookbook`.
 
 Customarily, we import as follows:
@@ -41,7 +38,7 @@ Customarily, we import as follows:
 Object Creation
 ---------------
 
-See the :ref:`Data Structure Intro section `
+See the :ref:`Data Structure Intro section `.
 
 Creating a :class:`Series` by passing a list of values, letting pandas create
 a default integer index:
@@ -51,7 +48,7 @@ a default integer index:
 
    s = pd.Series([1,3,5,np.nan,6,8])
   s
 
-Creating a :class:`DataFrame` by passing a numpy array, with a datetime index
+Creating a :class:`DataFrame` by passing a NumPy array, with a datetime index
 and labeled columns:
 
 .. ipython:: python
@@ -73,7 +70,8 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s
                        'F' : 'foo' })
    df2
 
-Having specific :ref:`dtypes `
+The columns of the resulting ``DataFrame`` have different
+:ref:`dtypes `.
 
 .. ipython:: python
@@ -87,29 +85,18 @@ will be completed:
 
    @verbatim
    In [1]: df2.
-   df2.A                  df2.boxplot
-   df2.abs                df2.C
-   df2.add                df2.clip
-   df2.add_prefix         df2.clip_lower
-   df2.add_suffix         df2.clip_upper
-   df2.align              df2.columns
-   df2.all                df2.combine
-   df2.any                df2.combineAdd
+   df2.A                  df2.bool
+   df2.abs                df2.boxplot
+   df2.add                df2.C
+   df2.add_prefix         df2.clip
+   df2.add_suffix         df2.clip_lower
+   df2.align              df2.clip_upper
+   df2.all                df2.columns
+   df2.any                df2.combine
    df2.append             df2.combine_first
-   df2.apply              df2.combineMult
-   df2.applymap           df2.compound
-   df2.as_blocks          df2.consolidate
-   df2.asfreq             df2.convert_objects
-   df2.as_matrix          df2.copy
-   df2.astype             df2.corr
-   df2.at                 df2.corrwith
-   df2.at_time            df2.count
-   df2.axes               df2.cov
-   df2.B                  df2.cummax
-   df2.between_time       df2.cummin
-   df2.bfill              df2.cumprod
-   df2.blocks             df2.cumsum
-   df2.bool               df2.D
+   df2.apply              df2.compound
+   df2.applymap           df2.consolidate
+   df2.D
 
 As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
 tab completed. ``E`` is there as well; the rest of the attributes have been
 truncated for brevity.
@@ -118,16 +105,16 @@ truncated for brevity.
 Viewing Data
 ------------
 
-See the :ref:`Basics section `
+See the :ref:`Basics section `.
 
-See the top & bottom rows of the frame
+Here is how to view the top and bottom rows of the frame:
 
 .. ipython:: python
 
    df.head()
   df.tail(3)
 
-Display the index, columns, and the underlying numpy data
+Display the index, columns, and the underlying NumPy data:
 
 .. ipython:: python
@@ -135,25 +122,25 @@ Display the index, columns, and the underlying numpy data
 
    df.index
    df.columns
   df.values
 
-Describe shows a quick statistic summary of your data
+:func:`~DataFrame.describe` shows a quick statistic summary of your data:
 
 .. ipython:: python
 
    df.describe()
 
-Transposing your data
+Transposing your data:
 
 .. ipython:: python
 
   df.T
 
-Sorting by an axis
+Sorting by an axis:
 
 .. ipython:: python
 
   df.sort_index(axis=1, ascending=False)
 
-Sorting by values
+Sorting by values:
 
 .. ipython:: python
@@ -167,15 +154,15 @@ Selection
 
    While standard Python / Numpy expressions for selecting and setting are
    intuitive and come in handy for interactive work, for production code, we
   recommend the optimized pandas data access methods, ``.at``, ``.iat``,
-   ``.loc``, ``.iloc`` and ``.ix``.
+   ``.loc`` and ``.iloc``.
 
-See the indexing documentation :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing `
+See the indexing documentation :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing `.
 
 Getting
 ~~~~~~~
 
 Selecting a single column, which yields a ``Series``,
-equivalent to ``df.A``
+equivalent to ``df.A``:
 
 .. ipython:: python
@@ -191,39 +178,39 @@ Selecting via ``[]``, which slices the rows.
 
 Selection by Label
 ~~~~~~~~~~~~~~~~~~
 
-See more in :ref:`Selection by Label `
+See more in :ref:`Selection by Label `.
 
-For getting a cross section using a label
+For getting a cross section using a label:
 
 .. ipython:: python
 
   df.loc[dates[0]]
 
-Selecting on a multi-axis by label
+Selecting on a multi-axis by label:
 
 .. ipython:: python
 
   df.loc[:,['A','B']]
 
-Showing label slicing, both endpoints are *included*
+Showing label slicing, both endpoints are *included*:
 
 .. ipython:: python
 
   df.loc['20130102':'20130104',['A','B']]
 
-Reduction in the dimensions of the returned object
+Reduction in the dimensions of the returned object:
 
 .. ipython:: python
 
   df.loc['20130102',['A','B']]
 
-For getting a scalar value
+For getting a scalar value:
 
 .. ipython:: python
 
   df.loc[dates[0],'A']
 
-For getting fast access to a scalar (equiv to the prior method)
+For getting fast access to a scalar (equivalent to the prior method):
 
 .. ipython:: python
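The Selection note above drops ``.ix`` from the recommended accessors. ``.ix`` had to guess between label-based and position-based lookup, which is precisely the ambiguity that ``.loc`` and ``.iloc`` split apart. A small illustrative example (mine, not part of the patch) using an integer index on which the two interpretations disagree:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.arange(9).reshape(3, 3),
                     index=[2, 0, 1], columns=list('ABC'))

   df.loc[0, 'A']    # label-based: the row *labelled* 0 -> 3
   df.iloc[0, 0]     # position-based: the row *at position* 0 -> 0

   # .at/.iat are the fast scalar versions of the same two lookups.
   df.at[0, 'A']     # -> 3
   df.iat[0, 0]      # -> 0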
@@ -232,45 +219,45 @@ For getting fast access to a scalar (equiv to the prior method)
 Selection by Position
 ~~~~~~~~~~~~~~~~~~~~~
 
-See more in :ref:`Selection by Position `
+See more in :ref:`Selection by Position `.
 
-Select via the position of the passed integers
+Select via the position of the passed integers:
 
 .. ipython:: python
 
   df.iloc[3]
 
-By integer slices, acting similar to numpy/python
+By integer slices, acting similar to numpy/python:
 
 .. ipython:: python
 
   df.iloc[3:5,0:2]
 
-By lists of integer position locations, similar to the numpy/python style
+By lists of integer position locations, similar to the numpy/python style:
 
 .. ipython:: python
 
   df.iloc[[1,2,4],[0,2]]
 
-For slicing rows explicitly
+For slicing rows explicitly:
 
 .. ipython:: python
 
   df.iloc[1:3,:]
 
-For slicing columns explicitly
+For slicing columns explicitly:
 
 .. ipython:: python
 
   df.iloc[:,1:3]
 
-For getting a value explicitly
+For getting a value explicitly:
 
 .. ipython:: python
 
   df.iloc[1,1]
 
-For getting fast access to a scalar (equiv to the prior method)
+For getting fast access to a scalar (equivalent to the prior method):
 
 .. ipython:: python
@@ -285,7 +272,7 @@ Using a single column's values to select data.
 
   df[df.A > 0]
 
-A ``where`` operation for getting.
+Selecting values from a DataFrame where a boolean condition is met.
 
 .. ipython:: python
@@ -304,7 +291,7 @@ Setting
 ~~~~~~~
 
 Setting a new column automatically aligns the data
-by the indexes
+by the indexes.
 
 .. ipython:: python
@@ -312,25 +299,25 @@ by the indexes
 
   s1
   df['F'] = s1
 
-Setting values by label
+Setting values by label:
 
 .. ipython:: python
 
   df.at[dates[0],'A'] = 0
 
-Setting values by position
+Setting values by position:
 
 .. ipython:: python
 
   df.iat[0,1] = 0
 
-Setting by assigning with a numpy array
+Setting by assigning with a NumPy array:
 
 .. ipython:: python
 
   df.loc[:,'D'] = np.array([5] * len(df))
 
-The result of the prior setting operations
+The result of the prior setting operations.
 
 .. ipython:: python
@@ -350,7 +337,7 @@ Missing Data
 
 pandas primarily uses the value ``np.nan`` to represent missing data. It is by
 default not included in computations. See the :ref:`Missing Data section
-`
+`.
 
 Reindexing allows you to change/add/delete the index on a
 specified axis. This returns a copy of the data.
@@ -367,36 +354,36 @@ To drop any rows that have missing data.
 
   df1.dropna(how='any')
 
-Filling missing data
+Filling missing data.
 
 .. ipython:: python
 
  df1.fillna(value=5)
 
-To get the boolean mask where values are ``nan``
+To get the boolean mask where values are ``nan``.
 
 .. ipython:: python
 
-   pd.isnull(df1)
+   pd.isna(df1)
 
 
 Operations
 ----------
 
-See the :ref:`Basic section on Binary Ops `
+See the :ref:`Basic section on Binary Ops `.
 
 Stats
 ~~~~~
 
 Operations in general *exclude* missing data.
 
-Performing a descriptive statistic
+Performing a descriptive statistic:
 
 .. ipython:: python
 
  df.mean()
 
-Same operation on the other axis
+Same operation on the other axis:
 
 .. ipython:: python
@@ -415,7 +402,7 @@ In addition, pandas automatically broadcasts along the specified dimension.
 Apply
 ~~~~~
 
-Applying functions to the data
+Applying functions to the data:
 
 .. ipython:: python
@@ -425,7 +412,7 @@ Applying functions to the data
 Histogramming
 ~~~~~~~~~~~~~
 
-See more at :ref:`Histogramming and Discretization `
+See more at :ref:`Histogramming and Discretization `.
 
 .. ipython:: python
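The ``pd.isnull`` to ``pd.isna`` change above is a rename rather than a behavior change: in pandas 0.21 and later the two functions are aliases, with ``isna``/``notna`` matching the ``DataFrame.isna`` method names. A short sketch (mine) of the boolean mask in use:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df1 = pd.DataFrame({'A': [1.0, np.nan], 'B': [np.nan, 2.0]})

   mask = pd.isna(df1)   # same result as the older pd.isnull(df1)
   mask
   mask.sum()            # count of missing values per column
   df1.fillna(value=5)   # the mask's True cells are the ones filled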
@@ -439,7 +426,7 @@ String Methods
 Series is equipped with a set of string processing methods in the `str`
 attribute that make it easy to operate on each element of the array, as in the
 code snippet below. Note that pattern-matching in `str` generally uses `regular
-expressions `__ by default (and in
+expressions `__ by default (and in
 some cases always uses them). See more at :ref:`Vectorized String Methods
 `.
@@ -459,7 +446,7 @@ DataFrame, and Panel objects with various kinds of set logic for the indexes
 and relational algebra functionality in the case of join / merge-type
 operations.
 
-See the :ref:`Merging section `
+See the :ref:`Merging section `.
 
 Concatenating pandas objects together with :func:`concat`:
@@ -476,7 +463,7 @@ Concatenating pandas objects together with :func:`concat`:
 Join
 ~~~~
 
-SQL style merges. See the :ref:`Database style joining `
+SQL style merges. See the :ref:`Database style joining ` section.
 
 .. ipython:: python
@@ -486,10 +473,22 @@ SQL style merges. See the :ref:`Database style joining `
   right
   pd.merge(left, right, on='key')
 
+Another example that can be given is:
+
+.. ipython:: python
+
+   left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
+   right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
+   left
+   right
+   pd.merge(left, right, on='key')
+
+
 Append
 ~~~~~~
 
-Append rows to a dataframe. See the :ref:`Appending `
+Append rows to a dataframe. See the :ref:`Appending `
+section.
 
 .. ipython:: python
@@ -503,13 +502,13 @@ Grouping
 --------
 
 By "group by" we are referring to a process involving one or more of the
-following steps
+following steps:
 
  - **Splitting** the data into groups based on some criteria
  - **Applying** a function to each group independently
  - **Combining** the results into a data structure
 
-See the :ref:`Grouping section `
+See the :ref:`Grouping section `.
 
 .. ipython:: python
@@ -521,14 +520,15 @@ See the :ref:`Grouping section `
                      'D' : np.random.randn(8)})
   df
 
-Grouping and then applying a function ``sum`` to the resulting groups.
+Grouping and then applying the :meth:`~DataFrame.sum` function to the resulting
+groups.
 
 .. ipython:: python
 
  df.groupby('A').sum()
 
-Grouping by multiple columns forms a hierarchical index, which we then apply
-the function.
+Grouping by multiple columns forms a hierarchical index, and again we can
+apply the ``sum`` function.
 
 .. ipython:: python
@@ -598,15 +598,15 @@ Time Series
 
 pandas has simple, powerful, and efficient functionality for performing
 resampling operations during frequency conversion (e.g., converting secondly
 data into 5-minutely data). This is extremely common in, but not limited to,
-financial applications. See the :ref:`Time Series section `
+financial applications. See the :ref:`Time Series section `.
 
 .. ipython:: python
 
   rng = pd.date_range('1/1/2012', periods=100, freq='S')
   ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
-   ts.resample('5Min', how='sum')
+   ts.resample('5Min').sum()
 
-Time zone representation
+Time zone representation:
 
 .. ipython:: python
@@ -616,13 +616,13 @@ Time zone representation
 
   ts_utc = ts.tz_localize('UTC')
   ts_utc
 
-Convert to another time zone
+Converting to another time zone:
 
 .. ipython:: python
 
  ts_utc.tz_convert('US/Eastern')
 
-Converting between time span representations
+Converting between time span representations:
 
 .. ipython:: python
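The resample hunk above tracks an API change: the ``how='sum'`` keyword was removed in favor of a deferred ``Resampler`` object whose aggregation is chosen by method chaining. A sketch (mine) of the new spelling:

.. code-block:: python

   import numpy as np
   import pandas as pd

   rng = pd.date_range('1/1/2012', periods=100, freq='S')
   ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

   # .resample(...) alone does no work; it returns a Resampler, and the
   # chained method picks the aggregation (the old how= argument).
   resampler = ts.resample('5Min')
   resampler.sum()    # equivalent to the removed ts.resample('5Min', how='sum')
   resampler.mean()   # one grouping, any number of aggregations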
@@ -648,7 +648,7 @@ the quarter end:
 Categoricals
 ------------
 
-Since version 0.15, pandas can include categorical data in a ``DataFrame``. For full docs, see the
+pandas can include categorical data in a ``DataFrame``. For full docs, see the
 :ref:`categorical introduction ` and the :ref:`API documentation `.
 
 .. ipython:: python
@@ -662,14 +662,15 @@ Convert the raw grades to a categorical data type.
 
   df["grade"] = df["raw_grade"].astype("category")
   df["grade"]
 
-Rename the categories to more meaningful names (assigning to ``Series.cat.categories`` is inplace!)
+Rename the categories to more meaningful names (assigning to
+``Series.cat.categories`` is inplace!).
 
 .. ipython:: python
 
  df["grade"].cat.categories = ["very good", "good", "very bad"]
 
 Reorder the categories and simultaneously add the missing categories (methods under ``Series
-.cat`` return a new ``Series`` per default).
+.cat`` return a new ``Series`` by default).
 
 .. ipython:: python
@@ -682,7 +683,7 @@ Sorting is per order in the categories, not lexical order.
 
  df.sort_values(by="grade")
 
-Grouping by a categorical column shows also empty categories.
+Grouping by a categorical column also shows empty categories.
 
 .. ipython:: python
@@ -692,7 +693,7 @@ Grouping by a categorical column shows also empty categories.
 Plotting
 --------
 
-:ref:`Plotting ` docs.
+See the :ref:`Plotting ` docs.
 
 .. ipython:: python
    :suppress:
@@ -708,8 +709,8 @@ Plotting
 
   @savefig series_plot_basic.png
   ts.plot()
 
-On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the
-columns with labels:
+On a DataFrame, the :meth:`~DataFrame.plot` method is a convenience to plot all
+of the columns with labels:
 
 .. ipython:: python
@@ -726,13 +727,13 @@ Getting Data In/Out
 CSV
 ~~~
 
-:ref:`Writing to a csv file `
+:ref:`Writing to a csv file. `
 
 .. ipython:: python
 
  df.to_csv('foo.csv')
 
-:ref:`Reading from a csv file `
+:ref:`Reading from a csv file. `
 
 .. ipython:: python
@@ -746,15 +747,15 @@ CSV
 HDF5
 ~~~~
 
-Reading and writing to :ref:`HDFStores `
+Reading and writing to :ref:`HDFStores `.
 
-Writing to a HDF5 Store
+Writing to a HDF5 Store.
 
 .. ipython:: python
 
  df.to_hdf('foo.h5','df')
 
-Reading from a HDF5 Store
+Reading from a HDF5 Store.
 
 .. ipython:: python
@@ -768,15 +769,15 @@ Reading from a HDF5 Store
 Excel
 ~~~~~
 
-Reading and writing to :ref:`MS Excel `
+Reading and writing to :ref:`MS Excel `.
 
-Writing to an excel file
+Writing to an excel file.
 
 .. ipython:: python
 
  df.to_excel('foo.xlsx', sheet_name='Sheet1')
 
-Reading from an excel file
+Reading from an excel file.
 
 .. ipython:: python
@@ -790,7 +791,7 @@ Reading from an excel file
 Gotchas
 -------
 
-If you are trying an operation and you see an exception like:
+If you are attempting to perform an operation, you might see an exception like:
 
 .. code-block:: python
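The Gotchas hunk rewords only the sentence introducing the exception, and the excerpt truncates before the exception text itself. For reference, a sketch (mine) of the error this section is about, together with the idiomatic alternatives:

.. code-block:: python

   import pandas as pd

   s = pd.Series([True, False, True])

   try:
       if s:              # ambiguous: all of them? any of them? non-empty?
           pass
   except ValueError as exc:
       print(exc)         # "The truth value of a Series is ambiguous. ..."

   s.any()      # True if at least one element is True
   s.all()      # True only if every element is True
   s.empty      # True if the Series has no elements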
diff --git a/doc/source/_static/banklist.html b/doc/source/_static/banklist.html
index 8ec1561f8c394..cbcce5a2d49ff 100644
--- a/doc/source/_static/banklist.html
+++ b/doc/source/_static/banklist.html
@@ -7,7 +7,7 @@
-
+
@@ -4849,7 +4849,7 @@
 
 Failed Bank List