cmu-delphi
diff --git a/‎nwss_wastewater/delphi_nwss/pull.py‎
Lines changed: 54 additions & 51 deletions b/‎nwss_wastewater/delphi_nwss/pull.py‎
Lines changed: 54 additions & 51 deletions
diff --git a/‎nwss_wastewater/delphi_nwss/run.py‎
Lines changed: 1 addition & 2 deletions b/‎nwss_wastewater/delphi_nwss/run.py‎
Lines changed: 1 addition & 2 deletions
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 """Functions for pulling NCHS mortality data API."""
 
-from typing import Optional
-
 import numpy as np
 import pandas as pd
 from sodapy import Socrata
@@ -32,7 +30,48 @@ def sig_digit_round(value, n_digits):
     return result
 
 
-def pull_nwss_data(token: str, test_file: Optional[str] = None):
+def construct_typedicts():
+    """Create the type conversion dictionary for both dataframes."""
+    # basic type conversion
+    signals_dict = {key: float for key in SIGNALS}
+    type_dict = {**signals_dict}
+    type_dict["timestamp"] = "datetime64[ns]"
+    # metric type conversion
+    signals_dict_metric = {key: float for key in METRIC_SIGNALS}
+    metric_dates_dict = {key: "datetime64[ns]" for key in METRIC_DATES}
+    type_dict_metric = {**metric_dates_dict, **signals_dict_metric, **SAMPLE_SITE_NAMES}
+    return type_dict, type_dict_metric
+
+
+def warn_string(df, type_dict):
+    """Format the warning string."""
+    return f"""
+Expected column(s) missed, The dataset schema may
+have changed. Please investigate and amend the code.
+
+Columns needed:
+{NEWLINE.join(type_dict.keys())}
+
+Columns available:
+{NEWLINE.join(df.columns)}
+"""
+
+
+def reformat(df, df_metric):
+    """Add the population column from df_metric to df, and rename some columns."""
+    # drop unused columns from df_metric
+    df_population = df_metric.loc[:, ["key_plot_id", "date_start", "population_served"]]
+    # get matching keys
+    df_population = df_population.rename(columns={"date_start": "timestamp"})
+    df_population = df_population.set_index(["key_plot_id", "timestamp"])
+    df = df.set_index(["key_plot_id", "timestamp"])
+
+    df = df.join(df_population)
+    df = df.reset_index()
+    return df
+
+
+def pull_nwss_data(token: str):
     """Pull the latest NWSS Wastewater data, and conforms it into a dataset.
 
     The output dataset has:
@@ -55,54 +94,26 @@ def pull_nwss_data(token: str, test_file: Optional[str] = None):
     """
     # Constants
     keep_columns = SIGNALS.copy()
-    signals_dict_metric = {key: float for key in METRIC_SIGNALS}
-    metric_dates_dict = {key: "datetime64[ns]" for key in METRIC_DATES}
-    type_dict_metric = {**metric_dates_dict, **signals_dict_metric, **SAMPLE_SITE_NAMES}
     # concentration key types
-    signals_dict = {key: float for key in SIGNALS}
-    type_dict = {**signals_dict}
-    type_dict["timestamp"] = "datetime64[ns]"
+    type_dict, type_dict_metric = construct_typedicts()
+
+    # Pull data from Socrata API
+    client = Socrata("data.cdc.gov", token)
+    results_concentration = client.get("g653-rqe2", limit=10**10)
+    results_metric = client.get("2ew6-ywp6", limit=10**10)
+    df_metric = pd.DataFrame.from_records(results_metric)
+    df = pd.DataFrame.from_records(results_concentration)
+    df = df.rename(columns={"date": "timestamp"})
 
-    if test_file:
-        df = pd.read_csv(f"./test_data/{test_file}")
-    else:
-        # Pull data from Socrata API
-        client = Socrata("data.cdc.gov", token)
-        results_concentration = client.get("g653-rqe2", limit=10**10)
-        results_metric = client.get("2ew6-ywp6", limit=10**10)
-        df_metric = pd.DataFrame.from_records(results_metric)
-        df = pd.DataFrame.from_records(results_concentration)
-        df = df.rename(columns={"date": "timestamp"})
     try:
         df = df.astype(type_dict)
     except KeyError as exc:
-        raise ValueError(
-            f"""
-Expected column(s) missed, The dataset schema may
-have changed. Please investigate and amend the code.
-
-Columns needed:
-{NEWLINE.join(type_dict.keys())}
+        raise ValueError(warn_string(df, type_dict)) from exc
 
-Columns available:
-{NEWLINE.join(df.columns)}
-"""
-        ) from exc
     try:
         df_metric = df_metric.astype(type_dict_metric)
     except KeyError as exc:
-        raise ValueError(
-            f"""
-Expected column(s) missed, The metric dataset schema may
-have changed. Please investigate and amend the code.
-
-Columns needed:
-{NEWLINE.join(type_dict_metric.keys())}
-
-Columns available:
-{NEWLINE.join(df_metric.columns)}
-"""
-        ) from exc
+        raise ValueError(warn_string(df_metric, type_dict_metric)) from exc
 
     # pull 2 letter state labels out of the key_plot_id labels
     df["state"] = df.key_plot_id.str.extract(r"_(\w\w)_")
@@ -111,15 +122,7 @@ def pull_nwss_data(token: str, test_file: Optional[str] = None):
     for signal in SIGNALS:
         df[signal] = sig_digit_round(df[signal], SIG_DIGITS)
 
-    # drop unused columns from df_metric
-    df_population = df_metric.loc[:, ["key_plot_id", "date_start", "population_served"]]
-    # get matching keys
-    df_population = df_population.rename(columns={"date_start": "timestamp"})
-    df_population = df_population.set_index(["key_plot_id", "timestamp"])
-    df = df.set_index(["key_plot_id", "timestamp"])
-
-    df = df.join(df_population)
-    df = df.reset_index()
+    df = reformat(df, df_metric)
     # if there are population NA's, assume the previous value is accurate (most
     # likely introduced by dates only present in one and not the other; even
     # otherwise, best to assume some value rather than break the data)
 
@@ -126,7 +126,6 @@ def run_module(params):
     )
     export_dir = params["common"]["export_dir"]
     token = params["indicator"]["token"]
-    test_file = params["indicator"].get("test_file", None)
     if "archive" in params:
         daily_arch_diff = S3ArchiveDiffer(
             params["archive"]["cache_dir"],
@@ -140,7 +139,7 @@ def run_module(params):
     run_stats = []
     ## build the base version of the signal at the most detailed geo level you can get.
     ## compute stuff here or farm out to another function or file
-    df_pull = pull_nwss_data(token, test_file)
+    df_pull = pull_nwss_data(token)
     ## aggregate
     for sensor in SIGNALS:
         df = df_pull.copy()