diff --git a/.gitignore b/.gitignore index f7c6b05..f01bf7a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,4 @@ __pycache__ dist/ build/ docs/_build -env/ +.venv/ \ No newline at end of file diff --git a/Makefile b/Makefile index 9ff983a..78e55f3 100644 --- a/Makefile +++ b/Makefile @@ -1,34 +1,35 @@ .PHONY = venv, lint, test, clean, release venv: - python3.8 -m venv env + python3.8 -m venv .venv install: venv - env/bin/python -m pip install --upgrade pip - env/bin/pip install -e ".[dev]" + .venv/bin/python -m pip install --upgrade pip + .venv/bin/pip install -e ".[dev]" lint_ruff: - env/bin/ruff check epidatpy tests + .venv/bin/ruff check epidatpy tests lint_mypy: - env/bin/mypy epidatpy tests + .venv/bin/mypy epidatpy tests lint_pylint: - env/bin/pylint epidatpy tests + .venv/bin/pylint epidatpy tests lint: lint_ruff lint_mypy lint_pylint format: - env/bin/ruff format epidatpy tests + .venv/bin/ruff format epidatpy tests test: - env/bin/pytest . + .venv/bin/pytest . -docs: - env/bin/sphinx-build -b html docs docs/_build - env/bin/python -m webbrowser -t "docs/_build/index.html" +doc: + @pandoc --version >/dev/null 2>&1 || (echo "ERROR: pandoc is required (install via your platform's package manager)"; exit 1) + .venv/bin/sphinx-build -b html docs docs/_build + .venv/bin/python -m webbrowser -t "docs/_build/index.html" -clean_docs: +clean_doc: rm -rf docs/_build clean_build: @@ -41,10 +42,10 @@ clean_python: find . -name '*.pyo' -exec rm -f {} + find . 
-name '__pycache__' -exec rm -fr {} + -clean: clean_docs clean_build clean_python +clean: clean_doc clean_build clean_python release: clean lint test - env/bin/python -m build --sdist --wheel + .venv/bin/python -m build --sdist --wheel upload: release - env/bin/twine upload dist/* + .venv/bin/twine upload dist/* diff --git a/README.md b/README.md index a366317..04a79b4 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![License: MIT][mit-image]][mit-url] [![Github Actions][github-actions-image]][github-actions-url] [![PyPi][pypi-image]][pypi-url] [![Read the Docs][docs-image]][docs-url] -A Python client for the [Delphi Epidata API](https://cmu-delphi.github.io/delphi-epidata/). Still in development. +The Python client for the [Delphi Epidata API](https://cmu-delphi.github.io/delphi-epidata/). ## Install @@ -18,7 +18,23 @@ pip install epidatpy ## Usage -TODO +```py +from epidatpy import CovidcastEpidata, EpiDataContext, EpiRange + +# All calls using the `epidata` object will now be cached for 7 days +epidata = EpiDataContext(use_cache=True, cache_max_age_days=7) + +# Obtain a DataFrame of the most up-to-date version of the confirmed cumulative COVID-19 cases +# from the JHU CSSE data source for the US +epidata.pub_covidcast( + data_source="jhu-csse", + signals="confirmed_cumulative_num", + geo_type="nation", + time_type="day", + geo_values="us", + time_values=EpiRange(20210405, 20210410), +).df() +``` ## Development @@ -35,6 +51,21 @@ make release # upload the current version to pypi make clean # clean build and docs artifacts ``` +Building the documentation additionally requires the Pandoc package.
These +commands can be used to install the package on common platforms (see the +[official documentation](https://pandoc.org/installing.html) for more options): + +```sh +# Linux (Debian/Ubuntu) +sudo apt-get install pandoc + +# OS X / Linux (with Homebrew) +brew install pandoc + +# Windows (with Chocolatey) +choco install pandoc +``` + ### Release Process The release consists of multiple steps which can be all done via the GitHub website: diff --git a/docs/conf.py b/docs/conf.py index 20fc7be..268f542 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,11 +31,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx_autodoc_typehints", - # 'matplotlib.sphinxext.plot_directive' -] +extensions = ["sphinx.ext.autodoc", "sphinx_autodoc_typehints", "nbsphinx"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -84,3 +80,8 @@ # https://pypi.org/project/sphinx-autodoc-typehints/ always_document_param_types = True + +# https://nbsphinx.readthedocs.io/ +nbsphinx_prompt_width = 0 +nbsphinx_input_prompt = "%.0s" +nbsphinx_output_prompt = "%.0s" diff --git a/docs/covidcast_examples.rst b/docs/covidcast_examples.rst deleted file mode 100644 index e02329d..0000000 --- a/docs/covidcast_examples.rst +++ /dev/null @@ -1,60 +0,0 @@ -Basic examples --------------- - -To obtain all available sources of epidemiological data, we can use the following command: - ->>> from delphi_epidata.request import CovidcastEpidata, EpiRange ->>> epidata = CovidcastEpidata() ->>> print(list(epidata.source_names)) -['chng-cli', 'chng-covid', 'covid-act-now', 'doctor-visits', 'fb-survey', 'google-symptoms', 'hhs', 'hospital-admissions', 'indicator-combination-cases-deaths', 'jhu-csse', 'quidel-covid-ag', 'safegraph-weekly', 'usa-facts', 'ght', 'google-survey', 'indicator-combination-nmf', 'quidel-flu', 
'safegraph-daily', 'nchs-mortality'] - - -To obtain smoothed estimates of COVID-like illness from our symptom survey, -distributed through Facebook (`fb-survey`), for every county in the United States between -2020-05-01 and 2020-05-07: - ->>> from delphi_epidata.request import EpiRange ->>> apicall = epidata[("fb-survey", "smoothed_cli")].call( -... 'county', "*", EpiRange(20200501, 20200507), -... ) -EpiDataCall(endpoint=covidcast, params={'data_source': 'fb-survey', 'signals': 'smoothed_cli', 'time_type': 'day', 'time_values': '20200501-20200507', 'geo_type': 'county', 'geo_values': '*'}) ->>> data = apicall.df() ->>> data.head() - source signal geo_type geo_value time_type time_value issue lag value stderr sample_size direction missing_value missing_stderr missing_sample_size -0 fb-survey smoothed_cli county 01000 day 2020-05-01 2020-09-03 125 0.825410 0.136003 1722 NaN 0 0 0 -1 fb-survey smoothed_cli county 01001 day 2020-05-01 2020-09-03 125 1.299425 0.967136 115 NaN 0 0 0 -2 fb-survey smoothed_cli county 01003 day 2020-05-01 2020-09-03 125 0.696597 0.324753 584 NaN 0 0 0 -3 fb-survey smoothed_cli county 01015 day 2020-05-01 2020-09-03 125 0.428271 0.548566 122 NaN 0 0 0 -4 fb-survey smoothed_cli county 01031 day 2020-05-01 2020-09-03 125 0.025579 0.360827 114 NaN 0 0 0 - -Each row represents one observation in one county on one day. The county FIPS -code is given in the ``geo_value`` column, the date in the ``time_value`` -column. Here ``value`` is the requested signal---in this case, the smoothed -estimate of the percentage of people with COVID-like illness, based on the -symptom surveys. ``stderr`` is its standard error. The ``issue`` column -indicates when this data was reported; in this case, the survey estimates for -May 1st were updated on September 3rd based on new data, giving a ``lag`` of 125 days. -See the `Delphi Epidata API `_ documentation for details on all fields of the returned data frame. 
- -The API documentation lists each available signal and provides technical details -on how it is estimated and how its standard error is calculated. In this case, -for example, the `symptom surveys documentation page -`_ -explains the definition of "COVID-like illness", links to the exact survey text, -and describes the mathematical derivation of the estimates. - -Using the ``geo_values`` argument, we can request data for a specific geography, -such as the state of Pennsylvania for the month of September 2021: - ->>> pa_data = epidata[("fb-survey", "smoothed_cli")].call( -... 'state', "pa", EpiRange(20210901, 20210930) -... ).df() ->>> pa_data.head() - source signal geo_type geo_value time_type time_value issue lag value stderr sample_size direction missing_value missing_stderr missing_sample_size -0 fb-survey smoothed_cli state pa day 2021-09-01 2021-09-06 5 0.928210 0.088187 9390 NaN 0 0 0 -1 fb-survey smoothed_cli state pa day 2021-09-02 2021-09-07 5 0.894603 0.087308 9275 NaN 0 0 0 -2 fb-survey smoothed_cli state pa day 2021-09-03 2021-09-08 5 0.922847 0.088324 9179 NaN 0 0 0 -3 fb-survey smoothed_cli state pa day 2021-09-04 2021-09-09 5 0.984799 0.092566 9069 NaN 0 0 0 -4 fb-survey smoothed_cli state pa day 2021-09-05 2021-09-10 5 1.010306 0.093357 9016 NaN 0 0 0 - -We can request multiple states by providing a list, such as ``["pa", "ny", "mo"]``. diff --git a/docs/epidatpy.rst b/docs/epidatpy.rst index f72e0d8..d3b7a1f 100644 --- a/docs/epidatpy.rst +++ b/docs/epidatpy.rst @@ -4,17 +4,6 @@ epidatpy Reference .. toctree:: :maxdepth: 4 -Submodules ----------- - -Module contents ---------------- - -.. automodule:: epidatpy - :members: - :undoc-members: - :show-inheritance: - epidatpy.request module ----------------------- @@ -23,11 +12,11 @@ epidatpy.request module :undoc-members: :show-inheritance: -epidatpy.async\_request module ------------------------------- +epidatpy._endpoints module +----------------------- -.. automodule:: epidatpy.async_request +.. 
automodule:: epidatpy._endpoints :members: :undoc-members: :show-inheritance: - + :exclude-members: get_wildcard_equivalent_dates diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb new file mode 100644 index 0000000..ec34864 --- /dev/null +++ b/docs/getting_started.ipynb @@ -0,0 +1,294 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting started\n", + "\n", + "The epidatpy package provides access to all the endpoints of the [Delphi Epidata\n", + "API](https://cmu-delphi.github.io/delphi-epidata/), and can be used to make\n", + "requests for specific signals on specific dates and in select geographic\n", + "regions.\n", + "\n", + "## Basic usage\n", + "\n", + "Fetching data from the Delphi Epidata API is simple. Suppose we are\n", + "interested in the [covidcast endpoint](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html),\n", + "which provides access to a [wide range of data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html)\n", + "on COVID-19. Reviewing the endpoint documentation, we see that we\n", + "[need to specify](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html#constructing-api-queries)\n", + "a data source name, a signal name, a geographic level, a time resolution, and\n", + "the location and times of interest.\n", + "\n", + "The `pub_covidcast` function lets us access the `covidcast` endpoint. Here we\n", + "demonstrate how to fetch the most up-to-date version of the confirmed cumulative COVID cases\n", + "from the JHU CSSE data source at the national level." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Hidden cell (set in the metadata for this cell)\n", + "import pandas as pd\n", + "\n", + "# Set common options and context\n", + "pd.set_option(\"display.max_columns\", None)\n", + "pd.set_option(\"display.max_rows\", 10)\n", + "pd.set_option(\"display.width\", 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from epidatpy import CovidcastEpidata, EpiDataContext, EpiRange\n", + "\n", + "# Create the client object. Note that due to the arguments below all results\n", + "# will be cached to your disk for 7 days, which helps avoid making repeated\n", + "# downloads.\n", + "epidata = EpiDataContext(use_cache=True, cache_max_age_days=7)\n", + "\n", + "# `pub_covidcast` returns an `EpiDataCall`, which is a not-yet-executed query\n", + "# that can be inspected.\n", + "apicall = epidata.pub_covidcast(\n", + " data_source=\"jhu-csse\",\n", + " signals=\"confirmed_cumulative_num\",\n", + " geo_type=\"nation\",\n", + " time_type=\"day\",\n", + " geo_values=\"us\",\n", + " time_values=EpiRange(20210405, 20210410),\n", + ")\n", + "print(apicall)\n", + "# The query can be executed and converted to a DataFrame by using the `.df()`\n", + "# method:\n", + "apicall.df()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the pub_covidcast-specific client object. 
This allows you to find what sources\n", + "# and signals are available without leaving your REPL.\n", + "covidcast = CovidcastEpidata(use_cache=True, cache_max_age_days=7)\n", + "# Get a list of all the sources available in the pub_covidcast endpoint.\n", + "print(covidcast.source_names())\n", + "print(covidcast.signal_names(\"jhu-csse\"))\n", + "# Obtain the same data as above with a different interface.\n", + "covidcast[\"jhu-csse\", \"confirmed_cumulative_num\"].call(\n", + " \"nation\",\n", + " \"us\",\n", + " EpiRange(20210405, 20210410),\n", + ").df()\n", + "# See the \"Finding data of interest\" notebook for more features of this interface." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each row represents one observation in the US on one\n", + "day. The geographical abbreviation is given in the `geo_value` column, the date in\n", + "the `time_value` column. Here `value` is the requested signal -- in this\n", + "case, the cumulative number of confirmed COVID-19 cases reported by\n", + "JHU CSSE, and `stderr` is its standard error.\n", + "\n", + "The Epidata API makes signals available at different geographic levels,\n", + "depending on the endpoint. To request signals for all states instead of the\n", + "entire US, we use the `geo_type` argument paired with `*` for the\n", + "`geo_values` argument. (Only some endpoints allow for the use of `*` to\n", + "access data at all locations.
Check the help for a given endpoint to see if\n", + "it supports `*`.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"fb-survey\",\n", + " signals=\"smoothed_cli\",\n", + " geo_type=\"state\",\n", + " time_type=\"day\",\n", + " geo_values=\"*\",\n", + " time_values=EpiRange(20210405, 20210410),\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, we can fetch the full time series for a subset of states by \n", + "listing out the desired locations in the `geo_values` argument and using\n", + "`*` in the `time_values` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"fb-survey\",\n", + " signals=\"smoothed_cli\",\n", + " geo_type=\"state\",\n", + " time_type=\"day\",\n", + " geo_values=\"pa,ca,fl\",\n", + " time_values=\"*\",\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting versioned data\n", + "\n", + "The Epidata API stores a historical record of all data, including corrections\n", + "and updates, which is particularly useful for accurately backtesting\n", + "forecasting models.
To fetch versioned data, we can use the `as_of`\n", + "argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"fb-survey\",\n", + " signals=\"smoothed_cli\",\n", + " geo_type=\"state\",\n", + " time_type=\"day\",\n", + " geo_values=\"pa\",\n", + " time_values=EpiRange(20210405, 20210410),\n", + " as_of=\"2021-06-01\",\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting\n", + "\n", + "Because the output data is a standard Pandas DataFrame, we can easily plot\n", + "it using any of the available Python libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.rcParams[\"figure.dpi\"] = 300\n", + "\n", + "apicall = epidata.pub_covidcast(\n", + " data_source=\"fb-survey\",\n", + " signals=\"smoothed_cli\",\n", + " geo_type=\"state\",\n", + " geo_values=\"pa,ca,fl\",\n", + " time_type=\"day\",\n", + " time_values=EpiRange(20210405, 20210410),\n", + ")\n", + "\n", + "fig, ax = plt.subplots(figsize=(6, 5))\n", + "ax.spines[\"right\"].set_visible(False)\n", + "ax.spines[\"left\"].set_visible(False)\n", + "ax.spines[\"top\"].set_visible(False)\n", + "\n", + "(\n", + " apicall.df()\n", + " .pivot_table(values=\"value\", index=\"time_value\", columns=\"geo_value\")\n", + " .plot(xlabel=\"Date\", ylabel=\"CLI\", ax=ax, linewidth=1.5)\n", + ")\n", + "\n", + "plt.title(\"Smoothed CLI from Facebook Survey\", fontsize=16)\n", + "plt.subplots_adjust(bottom=0.2)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finding locations of interest\n", + "\n", + "Most data is only available for the US. Select endpoints report other countries at the national and/or regional levels. 
Endpoint descriptions explicitly state when they cover non-US locations.\n", + "\n", + "For endpoints that report US data, see the\n", + "[geographic coding documentation](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_geography.html)\n", + "for available geographic levels.\n", + "\n", + "## International data\n", + "\n", + "International data is available via\n", + "\n", + "- `pub_dengue_nowcast` (North and South America)\n", + "- `pub_ecdc_ili` (Europe)\n", + "- `pub_kcdc_ili` (Korea)\n", + "- `pub_nidss_dengue` (Taiwan)\n", + "- `pub_nidss_flu` (Taiwan)\n", + "- `pub_paho_dengue` (North and South America)\n", + "- `pvt_dengue_sensors` (North and South America)\n", + "\n", + "## Finding data sources and signals of interest\n", + "\n", + "Above we used data from [Delphi’s symptom surveys](https://delphi.cmu.edu/covid19/ctis/),\n", + "but the Epidata API includes numerous data streams: medical claims data, cases\n", + "and deaths, mobility, and many others. This can make it a challenge to find\n", + "the data stream that you are most interested in.\n", + "\n", + "The Epidata documentation lists all the data sources and signals available\n", + "through the API for [COVID-19](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html)\n", + "and for [other diseases](https://cmu-delphi.github.io/delphi-epidata/api/README.html#source-specific-parameters).\n", + "\n", + "## Epiweeks and dates\n", + "\n", + "Formatting for epiweeks is YYYYWW and for dates is YYYYMMDD.\n", + "\n", + "Epiweeks use the U.S. CDC definition, which defines the first epiweek each year\n", + "to be the first week containing January 4th and the start of the week is on\n", + "Sunday. See [this\n", + "page](https://www.cmmcp.org/mosquito-surveillance-data/pages/epi-week-calendars-2008-2021)\n", + "for a less terse explanation. 
\n", + "\n", + "When specifying the time_values argument, you can use individual values,\n", + "comma-separated lists, or a hyphenated range of values to specify single or\n", + "several dates (or epiweeks). An `EpiRange` object can also be used to construct\n", + "a range of epiweeks or dates. Examples include:\n", + "\n", + "- `param = 201530` (A single epiweek)\n", + "- `param = '201401,201501,201601'` (Several epiweeks)\n", + "- `param = '200501-200552'` (A range of epiweeks)\n", + "- `param = '201440,201501-201510'` (Several epiweeks, including a range)\n", + "- `param = EpiRange(20070101, 20071231)` (A range of dates)\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/getting_started.rst b/docs/getting_started.rst deleted file mode 100644 index a993e2a..0000000 --- a/docs/getting_started.rst +++ /dev/null @@ -1,276 +0,0 @@ -Getting Started -=============== - -Overview --------------- - -This package provides access to data from various Epidata API endpoints including COVIDcast, -which provides numerous COVID-related data streams, updated daily. - -.. _epidata-endpoints: - -Epidata Data Sources --------------------- -The parameters available for each source data are documented in each linked source-specific API page. - -**COVID-19 Data** - -.. list-table:: - :widths: 20 20 40 - :header-rows: 1 - - * - Endpoint - - Name - - Description - * - `pub_covidcast `_ - - COVIDcast - - Delphi’s COVID-19 surveillance streams. - * - `pub_covidcast_meta `_ - - COVIDcast metadata - - Metadata for Delphi's COVID-19 surveillance streams. - * - `pub_covid_hosp_facility `_ - - COVID-19 Hospitalization by Facility - - COVID-19 Reported Patient Impact and Hospital Capacity - Facility Lookup - * - `pub_covid_hosp `_ - - COVID-19 Hospitalization - - COVID-19 Reported Patient Impact and Hospital Capacity. - -**Influenza Data** - -..
list-table:: - :widths: 20 20 40 - :header-rows: 1 - - * - Endpoint - - Name - - Description - * - `pvt_cdc `_ - - CDC Page Hits - - ... - * - `pub_delphi `_ - - Delphi’s Forecast - - ... - * - `pub_ecdc_ili `_ - - ECDC ILI - - ECDC ILI data from the ECDC website. - * - `pub_flusurv `_ - - FluSurv - - FluSurv-NET data (flu hospitaliation rates) from CDC. - * - `pub_fluview `_ - - FluView - - Influenza-like illness (ILI) from U.S. Outpatient Influenza-like Illness Surveillance Network (ILINet). - * - `pub_fluview_meta `_ - - FluView Metadata - - Summary data about ``fluview``. - * - `pub_fluview_clinical `_ - - FluView Clinical - - ... - * - `pub_gft `_ - - Google Flu Trends - - Estimate of influenza activity based on volume of certain search queries. This is now a static endpoint due to discontinuation. - * - `pub_kcdc_ili `_ - - KCDC ILI - - KCDC ILI data from KCDC website. - * - `pub_meta `_ - - API Metadata - - Metadata for ``fluview``, ``twitter``, ``wiki``, and ``delphi``. - * - `pub_nidss_flu `_ - - NIDSS Flu - - Outpatient ILI from Taiwan's National Infectious Disease Statistics System (NIDSS). - * - `pub_nowcast `_ - - ILI Nearby - - A nowcast of U.S. national, regional, and state-level (weighted) percent ILI, available seven days (regionally) or five days (state-level) before the first ILINet report for the corresponding week. - * - `pvt_quidel `_ - - Quidel - - Data provided by Quidel Corp., which contains flu lab test results. - * - `pvt_sensors `_ - - Delphi's Digital Surveillance Sensors - - ... - * - `pvt_twitter `_ - - Twitter Stream - - Estimate of influenza activity based on analysis of language used in tweets from HealthTweets. - * - `pub_wiki `_ - - Wikipedia Access Logs - - Number of page visits for selected English, Influenza-related wikipedia articles. - -**Dengue Data** - -.. list-table:: - :widths: 20 20 40 - :header-rows: 1 - - * - Endpoint - - Name - - Description - * - `pub_dengue_nowcast `_ - - Delphi's Dengue Nowcast - - ... 
- * - `pvt_dengue_sensors `_ - - Delphi’s Dengue Digital Surveillance Sensors - - ... - * - `pub_nidss_dengue `_ - - NIDSS Dengue - - Counts of confirmed dengue cases from Taiwan's NIDSS. - * - `pub_paho_dengue `_ - - PAHO Dengue - - ... - -**Norovirus Data** - -.. list-table:: - :widths: 20 20 40 - :header-rows: 1 - - * - Endpoint - - Name - - Description - * - `pvt_meta_norostat `_ - - NoroSTAT Metadata - - ... - * - `pvt_norostat `_ - - NoroSTAT - - Suspected and confirmed norovirus outbreaks reported by state health departments to the CDC. - -Epiweeks and Dates ------------------- -Epiweeks use the U.S. definition. That is, the first epiweek each year is the -week, starting on a Sunday, containing January 4. See `this page -`_ -for more information. - -Formatting for epiweeks is YYYYWW and for dates is YYYYMMDD. - -Use individual values, comma-separated lists or, a hyphenated range of values to specify single or several dates. -An ``EpiRange`` object can be also used to construct a range of epiweeks or dates. Examples include: - -- ``param = 201530`` (A single epiweek) -- ``param = '201401,201501,201601'`` (Several epiweeks) -- ``param = '200501-200552'`` (A range of epiweeks) -- ``param = '201440,201501-201510'`` (Several epiweeks, including a range) -- ``param = EpiRange(20070101, 20071231)`` (A range of dates) - -.. _getting-started: - -Basic examples --------------- - -**COVIDcast** - -To obtain smoothed estimates of COVID-like illness from our symptom survey, -distributed through Facebook, for every county in the United States between -2020-05-01 and 2020-05-07: - ->>> from epidatpy.request import Epidata, EpiRange ->>> apicall = Epidata.covidcast("fb-survey", "smoothed_cli", -... "day", "county", -... 
EpiRange(20200501, 20200507), "*") ->>> data = apicall.df() ->>> data.head() - source signal geo_type geo_value time_type time_value issue lag value stderr sample_size direction missing_value missing_stderr missing_sample_size -0 fb-survey smoothed_cli county 01000 day 2020-05-01 2020-09-03 125 0.825410 0.136003 1722 None 0 0 0 -1 fb-survey smoothed_cli county 01001 day 2020-05-01 2020-09-03 125 1.299425 0.967136 115 None 0 0 0 -2 fb-survey smoothed_cli county 01003 day 2020-05-01 2020-09-03 125 0.696597 0.324753 584 None 0 0 0 -3 fb-survey smoothed_cli county 01015 day 2020-05-01 2020-09-03 125 0.428271 0.548566 122 None 0 0 0 -4 fb-survey smoothed_cli county 01031 day 2020-05-01 2020-09-03 125 0.025579 0.360827 114 None 0 0 0 - -Each row represents one observation in one county per day. The county FIPS -code is given in the ``geo_value`` column, and the date is given in the ``time_value`` -column. The ``value`` is the requested signal - the smoothed -estimate of the percentage of people with COVID-like illness based on the -symptom surveys. The ``issue`` column indicates when this data was reported; in this case, the survey estimates for -May 1st were updated on September 3rd based on new data, giving a ``lag`` of 125 days. -See the :py:func:`epidatpy.request.Epidata.covidcast` documentation for further details on the returned -columns. - -In the above code, the ``.df()`` function on the ``apicall`` variable generated a Pandas DataFrame. We can use -other :ref:`output functions ` to parse the requested API call in different formats. 
To parse the data -into JSON format, we can use the following command: - ->>> data = apicall.json() ->>> data -[{'geo_value': '01000', - 'signal': 'smoothed_cli', - 'source': 'fb-survey', - 'geo_type': 'county', - 'time_type': 'day', - 'time_value': datetime.date(2020, 5, 1), - 'direction': None, - 'issue': datetime.date(2020, 9, 3), - 'lag': 125, - 'missing_value': 0, - 'missing_stderr': 0, - 'missing_sample_size': 0, - 'value': 0.8254101, - 'stderr': 0.1360033, - 'sample_size': 1722.4551}, - {'geo_value': '01001', - 'signal': 'smoothed_cli', - 'source': 'fb-survey', - 'geo_type': 'county', - 'time_type': 'day', - 'time_value': datetime.date(2020, 5, 1), - 'direction': None, - 'issue': datetime.date(2020, 9, 3), - 'lag': 125, - 'missing_value': 0, - 'missing_stderr': 0, - 'missing_sample_size': 0, - 'value': 1.2994255, - 'stderr': 0.9671356, - 'sample_size': 115.8025}, - . - . - . - }] - -Note that all of the :ref:`output functions ` have a ``field`` parameter which takes in any form of iterator objects -to enable fetching the data with customization (e.g. specifying which fields or columns to output). Similar to the previous example, -to parse the data in JSON format, but customize the field to show only ``geo_value`` and ``value``, we would use the following -command: - ->>> data = apicall.json(fields = ['geo_value', 'value']) ->>> data -[{'geo_value': '01000', 'value': 0.8254101}, - {'geo_value': '01001', 'value': 1.2994255}, - {'geo_value': '01003', 'value': 0.6965968}, - {'geo_value': '01015', 'value': 0.4282713}, - {'geo_value': '01031', 'value': 0.0255788}, - {'geo_value': '01045', 'value': 1.0495589}, - {'geo_value': '01051', 'value': 1.5783991}, - {'geo_value': '01069', 'value': 1.6789546}, - {'geo_value': '01071', 'value': 2.1313118}, - . - . - . 
- }] - -**Wikipedia Access article "influenza" on 2020w01** - ->>> apicall_wiki = Epidata.wiki(articles='influenza', epiweeks='202001') ->>> data = apicall_wiki.json() ->>> print(data) -[{'article': 'influenza', 'count': 6516, 'total': 663604044, 'hour': -1, 'epiweek': datetime.date(2019, 12, 29), 'value': 9.81910834}] - -**FluView on 2019w01 (national)** - ->>> apicall_fluview = Epidata.fluview(regions='nat', epiweeks='201901') ->>> data = apicall_fluview.classic() ->>> data -{'epidata': [{'release_date': '2020-10-02', - 'region': 'nat', - 'issue': datetime.date(2020, 3, 9), - 'epiweek': datetime.date(2018, 12, 30), - 'lag': 90, - 'num_ili': 42135, - 'num_patients': 1160440, - 'num_providers': 2630, - 'num_age_0': 11686, - 'num_age_1': 9572, - 'num_age_2': None, - 'num_age_3': 11413, - 'num_age_4': 5204, - 'num_age_5': 4260, - 'wili': 3.45972, - 'ili': 3.63095}], - 'result': 1, - 'message': 'success'} diff --git a/docs/index.rst b/docs/index.rst index a046b74..f7f86a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,32 +1,35 @@ +=============== epidatpy =============== This package provides Python access to the `Delphi Epidata API -`_ published by -the `Delphi research group `_ at `Carnegie Mellon University -`_. +`_ published by the `Delphi +research group `_ at `Carnegie Mellon University +`_. The package source code and bug tracker can be found +`on GitHub `_. -The package source code and bug tracker can be found `on GitHub -`_. +.. note :: **You should consider subscribing** to the `API mailing list + `_ to be + notified of package updates, new data sources, corrections, and other + updates. +See also the `CMU Delphi Terms of Use +`_, noting that the data +is a research product and not warranted for a particular purpose. Installation ------------- +=============== -This package will be available on PyPI as `epidatpy -`_ and will be installable with ``pip``. 
-Meanwhile, it can be installed from GitHub: +This package will soon be available on PyPI as `epidatpy +`_. Meanwhile, it can be installed from +GitHub: .. code-block:: sh pip install -e "git+https://github.com/cmu-delphi/epidatpy.git#egg=epidatpy" -The package requires `pandas `_ and `requests -`_; these should be installed -automatically. - API Keys --------- +=============== The Delphi Epidata API requires a (free) API key for full functionality. To generate your key, register for a pseudo-anonymous account `here @@ -34,43 +37,23 @@ generate your key, register for a pseudo-anonymous account `here discussion on the `general API website `_. The ``epidatpy`` client will automatically look for this key in the environment variable -``DELPHI_EPIDATA_KEY``. We recommend storing your key in a ``.env`` file and using +``DELPHI_EPIDATA_KEY``. We recommend storing your key in a ``.env`` file, using `python-dotenv `_ to load it into -your environment. - -Note that for the time being, the private endpoints (i.e. those prefixed with -``pvt``) will require a separate key that needs to be passed as an argument. +your environment, and adding ``.env`` to your ``.gitignore`` file. -See also the `COVIDcast Terms of Use -`_, noting that the data is a -research product and not warranted for a particular purpose. +Note that for the time being, the private endpoints (i.e. those prefixed with +``pvt``) will require additional permissions (contact us for more information). -For users of the covidcast Python package ------------------------------------------- - -The `covidcast `_ -package is deprecated and will no longer be updated. The ``epidatpy`` package is a -complete rewrite with a focus on speed, reliability, and ease of use. It also -supports more endpoints and data sources than ``covidcast``. When migrating from -that package, you will need to use the ``pub_covidcast`` function in -``epidatpy``. - -.. 
note :: **You should consider subscribing** to the `API mailing list - `_ to be - notified of package updates, new data sources, corrections, and other - updates. - -Contents --------- +Documentation Contents +====================== .. toctree:: - :maxdepth: 2 + :maxdepth: 1 getting_started - covidcast_examples + signal_discovery - signals_covid + versioned_data epidatpy - diff --git a/docs/signal_discovery.ipynb b/docs/signal_discovery.ipynb new file mode 100644 index 0000000..830b216 --- /dev/null +++ b/docs/signal_discovery.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finding data of interest\n", + "\n", + "The Epidata API includes numerous data streams -- medical claims data, cases and\n", + "deaths, mobility, and many others -- covering different geographic regions. This\n", + "can make it a challenge to find the data stream that you are most interested in.\n", + "This page will provide some advice on how to locate data that may be useful to\n", + "you.\n", + "\n", + "## Using the Delphi Epidata API documentation\n", + "\n", + "The Delphi Epidata API documentation lists all the available data sources and\n", + "signals for\n", + "[COVID-19](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html)\n", + "and for [other\n", + "diseases](https://cmu-delphi.github.io/delphi-epidata/api/README.html#source-specific-parameters).\n", + "The site also includes a search tool if you have a keyword (e.g. \"Taiwan\") in\n", + "mind. Generally, any endpoint listed in the Delphi Epidata API has an associated\n", + "function in this client where its API endpoint name is prefixed with either\n", + "`pub_` or `pvt_`, e.g. `pub_covidcast` or `pvt_twitter`.\n", + "\n", + "## Epidata data sources\n", + "\n", + "The parameters available for each source data are documented in each linked\n", + "source-specific API page.
The epidatpy client will also expect certain fields,\n", + "depending on the endpoint, though the Delphi Epidata API documentation will\n", + "contain more information about the accepted ranges of values for each field. \n", + "\n", + "A dynamically generated list of all available data sources can be obtained by\n", + "using the built-in `available_endpoints()`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Hidden cell (set in the metadata for this cell)\n", + "import pandas as pd\n", + "\n", + "# Set common options and context\n", + "pd.set_option(\"display.max_columns\", None)\n", + "pd.set_option(\"display.max_rows\", 10)\n", + "pd.set_option(\"display.width\", 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import HTML\n", + "\n", + "from epidatpy import available_endpoints\n", + "\n", + "table = available_endpoints()\n", + "HTML(table.to_html(index=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Covidcast source and signal metadata\n", + "\n", + "The `CovidcastEpidata` class provides a way to access information about the data\n", + "in the `pub_covidcast` endpoint directly from within the client. The cell below\n", + "demonstrates how to access this metadata by using the `source_df` property, which\n", + "returns a Pandas DataFrame of metadata describing all data streams publicly\n", + "accessible from the COVIDcast endpoint of the Delphi Epidata API. This mirrors\n", + "the information found in the [COVIDcast signals\n", + "endpoint](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from epidatpy import CovidcastEpidata\n", + "\n", + "epidata = CovidcastEpidata()\n", + "epidata.source_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This DataFrame contains the following columns:\n", + "\n", + "- `source` - API-internal source name.\n", + "- `name` - Human-readable source name.\n", + "- `description` - Description of the signal.\n", + "- `reference_signal` - Geographic level for which this signal is available, such as county, state, msa, hss, hrr, or nation. Most signals are available at multiple geographic levels and will hence be listed in multiple rows with their own metadata.\n", + "- `license` - The license.\n", + "- `dua` - Link to the Data Use Agreement.\n", + "- `signals` - List of signals available from this data source.\n", + "\n", + "The `signal_df` DataFrame can also be used to obtain information about the signals\n", + "that are available - for example, what time range they are available for,\n", + "and when they have been updated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.signal_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This DataFrame contains one row for each available signal, with the following columns:\n", + "\n", + "- `source` - Data source name.\n", + "- `signal` - API-internal signal name.\n", + "- `name` - Human-readable signal name.\n", + "- `active` - Whether the signal is currently updated or not. 
Signals may be inactive because the sources have become unavailable, other sources have replaced them, or additional work is required for us to continue updating them.\n", + "- `short_description` - Brief description of the signal.\n", + "- `description` - Full description of the signal.\n", + "- `geo_types` - Spatial resolution of the signal (e.g., `county`, `hrr`, `msa`, `dma`, `state`). More detail about all `geo_types` is given in the [geographic coding documentation](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_geography.html).\n", + "- `time_type` - Temporal resolution of the signal (e.g., day, week; see [date coding details](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_times.html)).\n", + "- `time_label` - The time label (\"Date\", \"Week\").\n", + "- `value_label` - The value label (\"Value\", \"Percentage\", \"Visits\", \"Visits per 100,000 people\").\n", + "- `format` - The value format (\"per100k\", \"percent\", \"fraction\", \"count\", \"raw\").\n", + "- `category` - The signal category (\"early\", \"public\", \"late\", \"other\").\n", + "- `high_values_are`- What the higher value of signal indicates (\"good\", \"bad\", \"neutral\").\n", + "- `is_smoothed` - Whether the signal is smoothed.\n", + "- `is_weighted` - Whether the signal is weighted.\n", + "- `is_cumulative` - Whether the signal is cumulative.\n", + "- `has_stderr` - Whether the signal has `stderr` statistic.\n", + "- `has_sample_size` - Whether the signal has `sample_size` statistic.\n", + "- `geo_types` - Geographical levels for which this signal is available.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/signals_covid.rst b/docs/signals_covid.rst deleted file mode 100644 index 46d486b..0000000 --- a/docs/signals_covid.rst +++ /dev/null @@ -1,72 +0,0 @@ 
-Fetching Data -============= - ->>> from epidatpy.request import Epidata ->>> epi = Epidata() ->>> epi.pub_covidcast('usa-facts', 'confirmed_7dav_incidence_num', '20210101', '20210131', 'state', 'tx') - -This package provides various functions that can be called on the ``Epidata`` object to obtain any :ref:`Epidata endpoint ` signals of interest. The functions below are inherited by the ``Epidata`` object. - -Detailed examples are provided in the :ref:`usage examples `. - -COVIDcast Signals ------------------ - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_covidcast - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_covidcast_meta - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_covid_hosp_facility - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_covid_hosp_facility_lookup - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_covid_hosp_state_timeseries - -Other Epidata Signals ---------------------- - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_cdc - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_delphi - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_ecdc_ili - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_flusurv - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_fluview - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_fluview_meta - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_fluview_clinical - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_gft - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_ght - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_kcdc_ili - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_meta - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_nidss_flu - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_nowcast - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_quidel - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_sensors - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_twitter - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_wiki - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_dengue_nowcast - -.. 
automethod:: epidatpy.AEpiDataEndpoints.pvt_dengue_sensors - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_nidss_dengue - -.. automethod:: epidatpy.AEpiDataEndpoints.pub_paho_dengue - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_meta_norostat - -.. automethod:: epidatpy.AEpiDataEndpoints.pvt_norostat diff --git a/docs/versioned_data.ipynb b/docs/versioned_data.ipynb new file mode 100644 index 0000000..02b3504 --- /dev/null +++ b/docs/versioned_data.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accessing versioned data\n", + "\n", + "The Delphi Epidata API stores not just each signal's estimate for a given\n", + "location on a given day, but also *when* that estimate was made, and all updates\n", + "to that estimate.\n", + "\n", + "For example, let's look at the [doctor visits\n", + "signal](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html)\n", + "from the [covidcast\n", + "endpoint](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html), which\n", + "estimates the percentage of outpatient doctor visits that are COVID-related.\n", + "\n", + "Consider a result row with `time_value = 2020-05-01` for `geo_values = \"pa\"`.\n", + "This is an estimate for Pennsylvania on May 1, 2020. That estimate was *issued*\n", + "on May 5, 2020 (which is recorded in the `issue` column), the delay coming from\n", + "a combination of:\n", + "\n", + "- time taken by our data partner to collect the data\n", + "- time taken by the Delphi Epidata API to ingest the data provided.\n", + "\n", + "Later, the estimate for May 1st could be updated, perhaps because additional\n", + "visit data from May 1st arrived at our source and was reported to us. This\n", + "constitutes a new *issue* of the data.\n", + "\n", + "## Data known \"as of\" a specific date\n", + "\n", + "By default, endpoint functions fetch the most recent issue available. 
This is\n", + "the best option for users who simply want to graph the latest data or construct\n", + "dashboards. But if we are interested in knowing *when* data was reported, we can\n", + "request specific data versions using the `as_of`, `issues`, or `lag` arguments\n", + "(note that these are mutually exclusive and that endpoints other than\n", + "`pub_covidcast` may not support all three parameters, so please check the documentation\n", + "for that specific endpoint).\n", + "\n", + "First, we can request the data *as it was available* on a\n", + "specific date, using the `as_of` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Hidden cell (set in the metadata for this cell)\n", + "import pandas as pd\n", + "\n", + "# Set common options and context\n", + "pd.set_option(\"display.max_columns\", None)\n", + "pd.set_option(\"display.max_rows\", 10)\n", + "pd.set_option(\"display.width\", 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from epidatpy import EpiDataContext, EpiRange\n", + "\n", + "epidata = EpiDataContext(use_cache=False)\n", + "\n", + "# Obtain the smoothed covid-like illness (CLI) signal from the doctor visits\n", + "# source for Pennsylvania, as it was known on 2020-05-07\n", + "epidata.pub_covidcast(\n", + " data_source=\"doctor-visits\",\n", + " signals=\"smoothed_cli\",\n", + " time_type=\"day\",\n", + " time_values=\"2020-05-01\",\n", + " geo_type=\"state\",\n", + " geo_values=\"pa\",\n", + " as_of=\"2020-05-07\",\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This shows that an estimate of about 2.3% was issued on May 7. 
If we don't\n", + "specify `as_of`, we get the most recent estimate available:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"doctor-visits\",\n", + " signals=\"smoothed_cli\",\n", + " time_type=\"day\",\n", + " time_values=\"2020-05-01\",\n", + " geo_type=\"state\",\n", + " geo_values=\"pa\",\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note the substantial change in the estimate, from less than 3% to over 5%,\n", + "reflecting new data that became available after May 7 about visits *occurring on*\n", + "May 1. This illustrates the importance of issue date tracking, particularly\n", + "for forecasting tasks. To backtest a forecasting model on past data, it is\n", + "important to use the data that would have been available *at the time* the model\n", + "was or would have been fit, not data that arrived much later.\n", + "\n", + "By plotting API results with different values of the `as_of` parameter, we can\n", + "see how the indicator value changes over time as new observations become available:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.rcParams[\"figure.dpi\"] = 300\n", + "\n", + "results = []\n", + "for as_of_date in [\"2020-05-07\", \"2020-05-14\", \"2020-05-21\", \"2020-05-28\"]:\n", + " apicall = epidata.pub_covidcast(\n", + " data_source=\"doctor-visits\",\n", + " signals=\"smoothed_adj_cli\",\n", + " time_type=\"day\",\n", + " time_values=EpiRange(\"2020-04-20\", \"2020-04-27\"),\n", + " geo_type=\"state\",\n", + " geo_values=\"pa\",\n", + " as_of=as_of_date,\n", + " )\n", + "\n", + " results.append(apicall.df())\n", + "\n", + "final_df = pd.concat(results)\n", + "final_df[\"issue\"] = final_df[\"issue\"].dt.date\n", + "\n", + "fig, ax = plt.subplots(figsize=(6, 5))\n", + 
"ax.spines[\"right\"].set_visible(False)\n", + "ax.spines[\"left\"].set_visible(False)\n", + "ax.spines[\"top\"].set_visible(False)\n", + "\n", + "final_df.pivot_table(values=\"value\", index=\"time_value\", columns=\"issue\").plot(\n", + " xlabel=\"Date\", ylabel=\"CLI\", ax=ax, linewidth=1.5\n", + ")\n", + "\n", + "plt.title(\"Smoothed CLI from Doctor Visits\", fontsize=16)\n", + "plt.subplots_adjust(bottom=0.2)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multiple issues of observations\n", + "\n", + "By using the `issues` argument, we can request all issues in a certain time\n", + "period:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"doctor-visits\",\n", + " signals=\"smoothed_adj_cli\",\n", + " time_type=\"day\",\n", + " time_values=\"2020-05-01\",\n", + " geo_type=\"state\",\n", + " geo_values=\"pa\",\n", + " issues=EpiRange(\"2020-05-01\", \"2020-05-15\"),\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This estimate was clearly updated many times as new data for May 1st arrived.\n", + "Note that these results include only data issued or updated between (inclusive)\n", + "2020-05-01 and 2020-05-15. If a value was first reported on 2020-04-15, and\n", + "never updated, a query for issues between 2020-05-01 and 2020-05-15 will not\n", + "include that value among its results. This view of the data is useful for\n", + "understanding the revision patterns in a signal and can be useful for nowcasting\n", + "(i.e. the practice of auto-correcting real-time estimates).\n", + "\n", + "## Observations issued with a specific lag\n", + "\n", + "Finally, we can use the `lag` argument to request only data reported with a\n", + "certain lag. 
For example, requesting a lag of 7 days fetches only data issued\n", + "exactly 7 days after the corresponding `time_value`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"doctor-visits\",\n", + " signals=\"smoothed_adj_cli\",\n", + " time_type=\"day\",\n", + " time_values=EpiRange(\"2020-05-01\", \"2020-05-01\"),\n", + " geo_type=\"state\",\n", + " geo_values=\"pa\",\n", + " lag=7,\n", + ").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that though this query requested all values between 2020-05-01 and\n", + "2020-05-07, May 3rd and May 4th were *not* included in the results set. This is\n", + "because the query will only include a result for May 3rd if a value were issued\n", + "on May 10th (a 7-day lag), but in fact the value was not updated on that day:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epidata.pub_covidcast(\n", + " data_source=\"doctor-visits\",\n", + " signals=\"smoothed_adj_cli\",\n", + " time_type=\"day\",\n", + " time_values=\"2020-05-03\",\n", + " geo_type=\"state\",\n", + " geo_values=\"pa\",\n", + " issues=EpiRange(\"2020-05-09\", \"2020-05-15\"),\n", + ").df()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/epidatpy/__init__.py b/epidatpy/__init__.py index aa88fa0..5cc7c39 100644 --- a/epidatpy/__init__.py +++ b/epidatpy/__init__.py @@ -1,10 +1,10 @@ """Fetch data from Delphi's API.""" # Make the linter happy about the unused variables -__all__ = ["__version__", "EpiDataContext", "CovidcastEpidata", "EpiRange"] +__all__ = ["__version__", "available_endpoints", "EpiDataContext", "CovidcastEpidata", "EpiRange"] __author__ = "Delphi Research Group" from ._constants import __version__ from ._model import EpiRange -from .request import 
CovidcastEpidata, EpiDataContext +from .request import CovidcastEpidata, EpiDataContext, available_endpoints diff --git a/epidatpy/_covidcast.py b/epidatpy/_covidcast.py index 4b6557e..d341450 100644 --- a/epidatpy/_covidcast.py +++ b/epidatpy/_covidcast.py @@ -33,8 +33,7 @@ @dataclass class WebLink: - """represents a web link - """ + """represents a web link""" alt: str href: str @@ -42,8 +41,7 @@ class WebLink: @dataclass class DataSignalGeoStatistics: - """COVIDcast signal statistics - """ + """COVIDcast signal statistics""" min: float max: float @@ -72,7 +70,7 @@ def define_covidcast_fields() -> List[EpidataFieldInfo]: EpidataFieldInfo("lag", EpidataFieldType.int), EpidataFieldInfo("value", EpidataFieldType.float), EpidataFieldInfo("stderr", EpidataFieldType.float), - EpidataFieldInfo("sample_size", EpidataFieldType.int), + EpidataFieldInfo("sample_size", EpidataFieldType.float), EpidataFieldInfo("direction", EpidataFieldType.float), EpidataFieldInfo("missing_value", EpidataFieldType.int), EpidataFieldInfo("missing_stderr", EpidataFieldType.int), @@ -82,8 +80,7 @@ def define_covidcast_fields() -> List[EpidataFieldInfo]: @dataclass class DataSignal(Generic[CALL_TYPE]): - """represents a COVIDcast data signal - """ + """represents a COVIDcast data signal""" _create_call: Callable[[Mapping[str, Optional[EpiRangeParam]]], CALL_TYPE] @@ -195,8 +192,7 @@ def __call__( @dataclass class DataSource(Generic[CALL_TYPE]): - """represents a COVIDcast data source - """ + """represents a COVIDcast data source""" _create_call: InitVar[Callable[[Mapping[str, Optional[EpiRangeParam]]], CALL_TYPE]] @@ -247,8 +243,7 @@ def signal_df(self) -> DataFrame: @dataclass class CovidcastDataSources(Generic[CALL_TYPE]): - """COVIDcast data source helper. 
- """ + """COVIDcast data source helper.""" sources: Sequence[DataSource[CALL_TYPE]] _source_by_name: Dict[str, DataSource[CALL_TYPE]] = field(init=False, default_factory=dict) diff --git a/epidatpy/_endpoints.py b/epidatpy/_endpoints.py index 3646c1a..bd42bd8 100644 --- a/epidatpy/_endpoints.py +++ b/epidatpy/_endpoints.py @@ -36,8 +36,7 @@ def get_wildcard_equivalent_dates(time_value: EpiRangeParam, time_type: Literal[ class AEpiDataEndpoints(ABC, Generic[CALL_TYPE]): - """epidata endpoint list and fetcher - """ + """epidata endpoint list and fetcher""" @abstractmethod def _create_call( diff --git a/epidatpy/request.py b/epidatpy/request.py index 26684bc..a8c9f05 100644 --- a/epidatpy/request.py +++ b/epidatpy/request.py @@ -1,3 +1,4 @@ +import inspect from os import environ from typing import ( Any, @@ -291,3 +292,10 @@ def create_call( ) return CovidcastDataSources.create(meta_data, create_call) + + +def available_endpoints() -> DataFrame: + """Get a DataFrame of available endpoints and their descriptions.""" + endpoints = [x for x in inspect.getmembers(AEpiDataEndpoints) if x[0].startswith("pvt_") or x[0].startswith("pub_")] + data = {e[0]: e[1].__doc__.split("\n")[0] if e[1].__doc__ else "None" for e in endpoints} + return DataFrame(data.items(), columns=["Endpoint", "Description"]) diff --git a/pyproject.toml b/pyproject.toml index 8321b5c..1ba68ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,10 @@ dependencies = [ [project.optional-dependencies] dev = [ + "ipykernel", + "matplotlib", "mypy", + "nbsphinx", "pylint", "pytest", "recommonmark",