diff --git a/.gitignore b/.gitignore
index a5df42b..377ce2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -170,3 +170,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+.vscode/
diff --git a/Makefile b/Makefile
index 58ec83b..d743dd7 100644
--- a/Makefile
+++ b/Makefile
@@ -106,8 +106,9 @@ create_tests:
 .PHONY: get_data
 get_data:
 	mkdir -p data/raw
-	wget -O data/raw/Practice_Level_Crosstab_Sep_24.zip https://files.digital.nhs.uk/A5/B4AB19/Practice_Level_Crosstab_Sep_24.zip
-	unzip -o data/raw/Practice_Level_Crosstab_Sep_24.zip -d data/raw
+	wget -O data/raw/_data.zip https://files.digital.nhs.uk/A5/B4AB19/Practice_Level_Crosstab_Sep_24.zip
+	unzip -o data/raw/_data.zip -d data/raw
+	rm data/raw/_data.zip
 
 
 #################################################################################
@@ -152,4 +153,4 @@ endef
 export PRINT_HELP_PYSCRIPT
 
 help:
-	@$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)
+	@$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)
\ No newline at end of file
diff --git a/code_your_own_pandas_pipeline/aggregations.py b/code_your_own_pandas_pipeline/aggregations.py
new file mode 100644
index 0000000..06d99dc
--- /dev/null
+++ b/code_your_own_pandas_pipeline/aggregations.py
@@ -0,0 +1,65 @@
+"""
+This module provides functions to pivot and summarize the practice level appointment data.
+"""
+
+import pandas as pd
+from loguru import logger
+
+placeholder_df = pd.DataFrame()
+
+
+def pivot_practice_level_data(practice_data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Pivot the practice level data.
+
+    Parameters
+    ----------
+    practice_data : pd.DataFrame
+        The practice data.
+
+    Returns
+    -------
+    pd.DataFrame
+        The pivoted data.
+    """
+    logger.info("Pivoting the practice level data.")
+
+    logger.warning("This function is not yet implemented.")
+
+
+def summarize_monthly_gp_appointments(pivot_practice_data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Summarize the monthly appointments by GP and Appointment Status.
+
+    Parameters
+    ----------
+    pivot_practice_data : pd.DataFrame
+        The pivoted practice data.
+
+    Returns
+    -------
+    pd.DataFrame
+        The summarized data.
+    """
+    logger.info("Summarizing the monthly GP appointments.")
+
+    logger.warning("This function is not yet implemented.")
+
+
+def summarize_monthly_region_appointments(pivot_practice_data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Summarize the monthly appointments by Region and Appointment Status.
+
+    Parameters
+    ----------
+    pivot_practice_data : pd.DataFrame
+        The pivoted practice data.
+
+    Returns
+    -------
+    pd.DataFrame
+        The summarized data.
+    """
+    logger.info("Summarizing the monthly region appointments.")
+
+    logger.warning("This function is not yet implemented.")
diff --git a/code_your_own_pandas_pipeline/config.py b/code_your_own_pandas_pipeline/config.py
new file mode 100644
index 0000000..a7a1544
--- /dev/null
+++ b/code_your_own_pandas_pipeline/config.py
@@ -0,0 +1,36 @@
+"""
+Configuration file for the code_your_own_pandas_pipeline package.
+"""
+
+from pathlib import Path
+
+from dotenv import load_dotenv
+from loguru import logger
+
+# Load environment variables from .env file if it exists
+load_dotenv()
+
+# Paths
+PROJ_ROOT = Path(__file__).resolve().parents[1]
+logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
+
+DATA_DIR = (PROJ_ROOT / "data").relative_to(PROJ_ROOT)
+RAW_DATA_DIR = DATA_DIR / "raw"
+INTERIM_DATA_DIR = DATA_DIR / "interim"
+PROCESSED_DATA_DIR = DATA_DIR / "processed"
+EXTERNAL_DATA_DIR = DATA_DIR / "external"
+
+MODELS_DIR = PROJ_ROOT / "models"
+
+REPORTS_DIR = PROJ_ROOT / "reports"
+FIGURES_DIR = REPORTS_DIR / "figures"
+
+# If tqdm is installed, configure loguru with tqdm.write
+# https://github.com/Delgan/loguru/issues/135
+try:
+    from tqdm import tqdm
+
+    logger.remove(0)
+    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
+except ModuleNotFoundError:
+    pass
diff --git a/code_your_own_pandas_pipeline/data_in.py b/code_your_own_pandas_pipeline/data_in.py
new file mode 100644
index 0000000..0212611
--- /dev/null
+++ b/code_your_own_pandas_pipeline/data_in.py
@@ -0,0 +1,35 @@
+"""
+This module contains the functions to read the mapping and practice crosstab data from the data
+folder.
+"""
+
+import pandas as pd
+from loguru import logger
+
+
+def read_mapping_data() -> pd.DataFrame:
+    """
+    Read the mapping data from the data folder.
+
+    Returns
+    -------
+    pd.DataFrame
+        The mapping data.
+    """
+    logger.info(f"Reading mapping data from {''}")
+
+    logger.warning("This function is not yet implemented.")
+
+
+def read_practice_crosstab_data() -> pd.DataFrame:
+    """
+    Read the practice crosstab data from the data folder.
+
+    Returns
+    -------
+    pd.DataFrame
+        The practice crosstab data.
+    """
+    logger.info(f"Reading practice crosstab data from {''}")
+
+    logger.warning("This function is not yet implemented.")
diff --git a/code_your_own_pandas_pipeline/pipeline.py b/code_your_own_pandas_pipeline/pipeline.py
new file mode 100644
index 0000000..64b8d2e
--- /dev/null
+++ b/code_your_own_pandas_pipeline/pipeline.py
@@ -0,0 +1,41 @@
+"""
+Main pipeline for the code_your_own_pandas_pipeline package.
+"""
+
+import pandas as pd
+from loguru import logger
+
+from code_your_own_pandas_pipeline import aggregations, data_in, plots, processing
+
+placeholder_df = pd.DataFrame()
+
+
+def main() -> None:
+    """
+    Main function to run the pipeline.
+
+    Returns
+    -------
+    None
+    """
+    logger.level("START", no=15, color="")
+    logger.log("START", "Starting the GP Appointment Data Pipeline")
+
+    data_in.read_mapping_data()
+    data_in.read_practice_crosstab_data()
+
+    processing.tidy_practice_level_data(placeholder_df)
+    processing.merge_mapping_and_practice_data(placeholder_df, placeholder_df)
+
+    aggregations.pivot_practice_level_data(placeholder_df)
+    aggregations.summarize_monthly_gp_appointments(placeholder_df)
+    aggregations.summarize_monthly_region_appointments(placeholder_df)
+
+    plots.plot_monthly_gp_appointments(placeholder_df, "placeholder_str")
+    plots.plot_monthly_region_appointments(placeholder_df, "placeholder_str")
+
+    logger.success("GP Appointment Data Pipeline Completed")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/code_your_own_pandas_pipeline/plots.py b/code_your_own_pandas_pipeline/plots.py
new file mode 100644
index 0000000..cf4d9cd
--- /dev/null
+++ b/code_your_own_pandas_pipeline/plots.py
@@ -0,0 +1,72 @@
+"""
+This module provides functions for generating and saving plots.
+"""
+
+import pandas as pd
+from loguru import logger
+
+
+def save_plot(plot, output_folder: str, plot_name: str) -> None:
+    """
+    Save the plot to the output folder.
+
+    Parameters
+    ----------
+    plot : matplotlib.pyplot
+        The plot to save.
+    output_folder : str
+        The output folder to save the plot.
+    plot_name : str
+        The plot name.
+
+    Returns
+    -------
+    None
+    """
+    logger.info(f"Saving the plot {plot_name} to {output_folder}.")
+
+    logger.warning("This function is not yet implemented.")
+
+
+def plot_monthly_gp_appointments(
+    monthly_gp_appointments: pd.DataFrame, output_folder: str
+) -> None:
+    """
+    Plot the monthly GP appointments.
+
+    Parameters
+    ----------
+    monthly_gp_appointments : pd.DataFrame
+        The monthly GP appointments data.
+    output_folder : str
+        The output folder to save the plots.
+
+    Returns
+    -------
+    None
+    """
+    logger.info("Plotting the monthly GP appointments.")
+
+    logger.warning("This function is not yet implemented.")
+
+
+def plot_monthly_region_appointments(
+    monthly_region_appointments: pd.DataFrame, output_folder: str
+) -> None:
+    """
+    Plot the monthly region appointments.
+
+    Parameters
+    ----------
+    monthly_region_appointments : pd.DataFrame
+        The monthly region appointments data.
+    output_folder : str
+        The output folder to save the plots.
+
+    Returns
+    -------
+    None
+    """
+    logger.info("Plotting the monthly region appointments.")
+
+    logger.warning("This function is not yet implemented.")
diff --git a/code_your_own_pandas_pipeline/processing.py b/code_your_own_pandas_pipeline/processing.py
new file mode 100644
index 0000000..bac11f3
--- /dev/null
+++ b/code_your_own_pandas_pipeline/processing.py
@@ -0,0 +1,50 @@
+"""
+This module contains the functions to process the mapping and practice crosstab data and merge them.
+"""
+
+import pandas as pd
+from loguru import logger
+
+placeholder_df = pd.DataFrame()
+
+
+def tidy_practice_level_data(practice_data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Tidy the practice crosstab data.
+
+    Parameters
+    ----------
+    practice_data : pd.DataFrame
+        The practice crosstab data.
+
+    Returns
+    -------
+    pd.DataFrame
+        The tidy practice crosstab data.
+    """
+    logger.info("Tidying the practice crosstab data.")
+
+    logger.warning("This function is not yet implemented.")
+
+
+def merge_mapping_and_practice_data(
+    mapping_data: pd.DataFrame, practice_data: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Merge the mapping and practice data.
+
+    Parameters
+    ----------
+    mapping_data : pd.DataFrame
+        The mapping data.
+    practice_data : pd.DataFrame
+        The practice data.
+
+    Returns
+    -------
+    pd.DataFrame
+        The merged data.
+    """
+    logger.info("Merging the mapping and practice data.")
+
+    logger.warning("This function is not yet implemented.")
diff --git a/tests/unittests/test_aggregations.py b/tests/unittests/test_aggregations.py
new file mode 100644
index 0000000..beb45f1
--- /dev/null
+++ b/tests/unittests/test_aggregations.py
@@ -0,0 +1,212 @@
+"""
+Tests for code_your_own_pandas_pipeline.aggregations
+"""
+
+import pandas as pd
+import pytest
+
+from code_your_own_pandas_pipeline.aggregations import (
+    pivot_practice_level_data,
+    summarize_monthly_gp_appointments,
+    summarize_monthly_region_appointments,
+)
+
+
+@pytest.fixture
+def practice_test_data():
+    return pd.DataFrame(
+        columns=[
+            "APPOINTMENT_MONTH_START_DATE",
+            "GP_NAME",
+            "REGION_NAME",
+            "APPT_STATUS",
+            "COUNT_OF_APPOINTMENTS",
+        ],
+        data=[
+            ["2021-01-01", "Example GP A", "REGION1", "ATTENDED", 1],
+            ["2021-01-01", "Example GP B", "REGION1", "ATTENDED", 4],
+            ["2021-02-01", "Example GP A", "REGION1", "ATTENDED", 7],
+            ["2021-02-01", "Example GP B", "REGION1", "ATTENDED", 10],
+            ["2021-01-01", "Example GP A", "REGION2", "ATTENDED", 1],
+            ["2021-01-01", "Example GP B", "REGION2", "ATTENDED", 4],
+            ["2021-02-01", "Example GP A", "REGION2", "ATTENDED", 7],
+            ["2021-02-01", "Example GP B", "REGION2", "ATTENDED", 10],
+            ["2021-01-01", "Example GP A", "REGION1", "ATTENDED", 1],
+            ["2021-01-01", "Example GP B", "REGION1", "ATTENDED", 4],
+            ["2021-02-01", "Example GP A", "REGION1", "ATTENDED", 7],
+            ["2021-02-01", "Example GP B", "REGION1", "ATTENDED", 10],
+            ["2021-01-01", "Example GP A", "REGION2", "ATTENDED", 1],
+            ["2021-01-01", "Example GP B", "REGION2", "ATTENDED", 4],
+            ["2021-02-01", "Example GP A", "REGION2", "ATTENDED", 7],
+            ["2021-02-01", "Example GP B", "REGION2", "ATTENDED", 10],
+            ["2021-01-01", "Example GP A", "REGION1", "DID NOT ATTEND", 2],
+            ["2021-01-01", "Example GP B", "REGION1", "DID NOT ATTEND", 5],
+            ["2021-02-01", "Example GP A", "REGION1", "DID NOT ATTEND", 8],
+            ["2021-02-01", "Example GP B", "REGION1", "DID NOT ATTEND", 11],
+            ["2021-01-01", "Example GP A", "REGION2", "DID NOT ATTEND", 2],
+            ["2021-01-01", "Example GP B", "REGION2", "DID NOT ATTEND", 5],
+            ["2021-02-01", "Example GP A", "REGION2", "DID NOT ATTEND", 8],
+            ["2021-02-01", "Example GP B", "REGION2", "DID NOT ATTEND", 11],
+            ["2021-01-01", "Example GP A", "REGION1", "DID NOT ATTEND", 2],
+            ["2021-01-01", "Example GP B", "REGION1", "DID NOT ATTEND", 5],
+            ["2021-02-01", "Example GP A", "REGION1", "DID NOT ATTEND", 8],
+            ["2021-02-01", "Example GP B", "REGION1", "DID NOT ATTEND", 11],
+            ["2021-01-01", "Example GP A", "REGION2", "DID NOT ATTEND", 2],
+            ["2021-01-01", "Example GP B", "REGION2", "DID NOT ATTEND", 5],
+            ["2021-02-01", "Example GP A", "REGION2", "DID NOT ATTEND", 8],
+            ["2021-02-01", "Example GP B", "REGION2", "DID NOT ATTEND", 11],
+            ["2021-01-01", "Example GP A", "REGION1", "UNKNOWN", 3],
+            ["2021-01-01", "Example GP B", "REGION1", "UNKNOWN", 6],
+            ["2021-02-01", "Example GP A", "REGION1", "UNKNOWN", 9],
+            ["2021-02-01", "Example GP B", "REGION1", "UNKNOWN", 12],
+            ["2021-01-01", "Example GP A", "REGION2", "UNKNOWN", 3],
+            ["2021-01-01", "Example GP B", "REGION2", "UNKNOWN", 6],
+            ["2021-02-01", "Example GP A", "REGION2", "UNKNOWN", 9],
+            ["2021-02-01", "Example GP B", "REGION2", "UNKNOWN", 12],
+            ["2021-01-01", "Example GP A", "REGION1", "UNKNOWN", 3],
+            ["2021-01-01", "Example GP B", "REGION1", "UNKNOWN", 6],
+            ["2021-02-01", "Example GP A", "REGION1", "UNKNOWN", 9],
+            ["2021-02-01", "Example GP B", "REGION1", "UNKNOWN", 12],
+            ["2021-01-01", "Example GP A", "REGION2", "UNKNOWN", 3],
+            ["2021-01-01", "Example GP B", "REGION2", "UNKNOWN", 6],
+            ["2021-02-01", "Example GP A", "REGION2", "UNKNOWN", 9],
+            ["2021-02-01", "Example GP B", "REGION2", "UNKNOWN", 12],
+        ],
+    )
+
+
+@pytest.fixture
+def practice_pivot_test_data():
+    return pd.DataFrame(
+        columns=[
+            "APPOINTMENT_MONTH_START_DATE",
+            "GP_NAME",
+            "REGION_NAME",
+            "ATTENDED",
+            "DID NOT ATTEND",
+            "UNKNOWN",
+        ],
+        data=[
+            ["2021-01-01", "Example GP A", "REGION1", 1, 2, 3],
+            ["2021-01-01", "Example GP B", "REGION1", 4, 5, 6],
+            ["2021-02-01", "Example GP A", "REGION1", 7, 8, 9],
+            ["2021-02-01", "Example GP B", "REGION1", 10, 11, 12],
+            ["2021-01-01", "Example GP A", "REGION2", 1, 2, 3],
+            ["2021-01-01", "Example GP B", "REGION2", 4, 5, 6],
+            ["2021-02-01", "Example GP A", "REGION2", 7, 8, 9],
+            ["2021-02-01", "Example GP B", "REGION2", 10, 11, 12],
+        ]
+        * 2,
+    )
+
+
+class TestPivotPracticeLevelData:
+    """
+    Tests for the pivot_practice_level_data function.
+    """
+
+    def test_returns_dataframe(self, practice_test_data):
+        """
+        Check that the function returns a DataFrame.
+        """
+        actual = pivot_practice_level_data(practice_test_data)
+        assert isinstance(actual, pd.DataFrame)
+
+    def test_return_not_empty(self, practice_test_data):
+        """
+        Check that the function returns a non-empty DataFrame.
+        """
+        actual = pivot_practice_level_data(practice_test_data)
+        assert not actual.empty
+
+    def test_return_pivoted_data(self, practice_test_data, practice_pivot_test_data):
+        """
+        Check that the function returns the pivoted data.
+        """
+        actual = pivot_practice_level_data(practice_test_data)
+        expected = practice_pivot_test_data
+        pd.testing.assert_frame_equal(actual, expected)
+
+
+class TestSummarizeMonthlyGPAppointments:
+    """
+    Tests for the summarize_monthly_gp_appointments function.
+    """
+
+    def test_returns_dataframe(self, practice_pivot_test_data):
+        """
+        Check that the function returns a DataFrame.
+        """
+        actual = summarize_monthly_gp_appointments(practice_pivot_test_data)
+        assert isinstance(actual, pd.DataFrame)
+
+    def test_return_not_empty(self, practice_pivot_test_data):
+        """
+        Check that the function returns a non-empty DataFrame.
+        """
+        actual = summarize_monthly_gp_appointments(practice_pivot_test_data)
+        assert not actual.empty
+
+    def test_return_summarized_data(self, practice_pivot_test_data):
+        """
+        Check that the function returns the summarized data.
+        """
+        actual = summarize_monthly_gp_appointments(practice_pivot_test_data)
+        expected = pd.DataFrame(
+            [
+                ["2021-01-01", "Example GP A", 4, 8, 12],
+                ["2021-01-01", "Example GP B", 16, 20, 24],
+                ["2021-02-01", "Example GP A", 28, 32, 36],
+                ["2021-02-01", "Example GP B", 40, 44, 48],
+            ],
+            columns=[
+                "APPOINTMENT_MONTH_START_DATE",
+                "GP_NAME",
+                "ATTENDED",
+                "DID NOT ATTEND",
+                "UNKNOWN",
+            ],
+        )
+        pd.testing.assert_frame_equal(actual, expected)
+
+
+class TestSummarizeMonthlyRegionAppointments:
+    """
+    Tests for the summarize_monthly_region_appointments function.
+    """
+
+    def test_returns_dataframe(self, practice_pivot_test_data):
+        """
+        Check that the function returns a DataFrame.
+        """
+        actual = summarize_monthly_region_appointments(practice_pivot_test_data)
+        assert isinstance(actual, pd.DataFrame)
+
+    def test_return_not_empty(self, practice_pivot_test_data):
+        """
+        Check that the function returns a non-empty DataFrame.
+        """
+        actual = summarize_monthly_region_appointments(practice_pivot_test_data)
+        assert not actual.empty
+
+    def test_return_summarized_data(self, practice_pivot_test_data):
+        """
+        Check that the function returns the summarized data.
+        """
+        actual = summarize_monthly_region_appointments(practice_pivot_test_data)
+        expected = pd.DataFrame(
+            [
+                ["2021-01-01", "REGION1", 10, 14, 18],
+                ["2021-01-01", "REGION2", 10, 14, 18],
+                ["2021-02-01", "REGION1", 34, 38, 42],
+                ["2021-02-01", "REGION2", 34, 38, 42],
+            ],
+            columns=[
+                "APPOINTMENT_MONTH_START_DATE",
+                "REGION_NAME",
+                "ATTENDED",
+                "DID NOT ATTEND",
+                "UNKNOWN",
+            ],
+        )
+        pd.testing.assert_frame_equal(actual, expected)
diff --git a/tests/unittests/test_data_in.py b/tests/unittests/test_data_in.py
new file mode 100644
index 0000000..cbb8b08
--- /dev/null
+++ b/tests/unittests/test_data_in.py
@@ -0,0 +1,118 @@
+"""
+Tests for code_your_own_pandas_pipeline.data_in
+"""
+import pandas as pd
+import numpy as np
+
+from code_your_own_pandas_pipeline.data_in import read_mapping_data, read_practice_crosstab_data
+
+
+class TestReadMappingData:
+    """
+    Tests for the read_mapping_data function.
+    """
+
+    def test_return_type(self):
+        """
+        Test that the read_mapping_data function returns a pandas DataFrame.
+        """
+        actual = read_mapping_data()
+
+        assert isinstance(actual, pd.DataFrame)
+
+    def test_return_not_empty(self):
+        """
+        Tests that the read_mapping_data function returns a non-empty DataFrame
+        """
+        actual = read_mapping_data()
+
+        assert not actual.empty
+
+    def test_shape(self):
+        """
+        Tests that the read_mapping_data function returns a DataFrame with the correct shape.
+        """
+        actual = read_mapping_data()
+
+        assert actual.shape == (6241, 11)
+
+    def test_schema(self):
+        """
+        Tests that the read_mapping_data function returns a DataFrame with the correct schema.
+        """
+        actual = read_mapping_data()
+
+        expected_schema = pd.Series(
+            {
+                "GP_CODE": np.dtype("O"),
+                "GP_NAME": np.dtype("O"),
+                "SUPPLIER": np.dtype("O"),
+                "PCN_CODE": np.dtype("O"),
+                "PCN_NAME": np.dtype("O"),
+                "SUB_ICB_LOCATION_CODE": np.dtype("O"),
+                "SUB_ICB_LOCATION_NAME": np.dtype("O"),
+                "ICB_CODE": np.dtype("O"),
+                "ICB_NAME": np.dtype("O"),
+                "REGION_CODE": np.dtype("O"),
+                "REGION_NAME": np.dtype("O"),
+            }
+        )
+
+        pd.testing.assert_series_equal(actual.dtypes, expected_schema)
+
+
+class TestReadPracticeCrosstabData:
+    """
+    Tests for the read_practice_crosstab_data function.
+    """
+
+    def test_return_type(self):
+        """
+        Test that the read_practice_crosstab_data function returns a pandas DataFrame.
+        """
+        actual = read_practice_crosstab_data()
+
+        assert isinstance(actual, pd.DataFrame)
+
+    def test_return_not_empty(self):
+        """
+        Tests that the read_practice_crosstab_data function returns a non-empty DataFrame
+        """
+        actual = read_practice_crosstab_data()
+
+        assert not actual.empty
+
+    def test_shape(self):
+        """
+        Tests that the read_practice_crosstab_data function returns a DataFrame with the correct shape.
+        """
+        actual = read_practice_crosstab_data()
+
+        assert actual.shape == (2971190, 14)
+
+    def test_schema(self):
+        """
+        Tests that the read_practice_crosstab_data function returns a DataFrame with the correct schema.
+        """
+        actual = read_practice_crosstab_data()
+
+        expected_schema = pd.Series(
+            {
+                "APPOINTMENT_MONTH_START_DATE": np.dtype("O"),
+                "GP_CODE": np.dtype("O"),
+                "GP_NAME": np.dtype("O"),
+                "SUPPLIER": np.dtype("O"),
+                "PCN_CODE": np.dtype("O"),
+                "PCN_NAME": np.dtype("O"),
+                "SUB_ICB_LOCATION_CODE": np.dtype("O"),
+                "SUB_ICB_LOCATION_NAME": np.dtype("O"),
+                "HCP_TYPE": np.dtype("O"),
+                "APPT_MODE": np.dtype("O"),
+                "NATIONAL_CATEGORY": np.dtype("O"),
+                "TIME_BETWEEN_BOOK_AND_APPT": np.dtype("O"),
+                "COUNT_OF_APPOINTMENTS": np.dtype("int64"),
+                "APPT_STATUS": np.dtype("O"),
+            }
+        )
+
+        pd.testing.assert_series_equal(actual.dtypes, expected_schema)
diff --git a/tests/unittests/test_pipeline.py b/tests/unittests/test_pipeline.py
new file mode 100644
index 0000000..59c441a
--- /dev/null
+++ b/tests/unittests/test_pipeline.py
@@ -0,0 +1,13 @@
+"""
+Tests for code_your_own_pandas_pipeline.pipeline
+"""
+import pytest
+
+import code_your_own_pandas_pipeline.pipeline
+
+
+class TestExample:
+    """Example test class"""
+    def test_example(self):
+        """Example test case"""
+        assert True
diff --git a/tests/unittests/test_plots.py b/tests/unittests/test_plots.py
new file mode 100644
index 0000000..c4d1fed
--- /dev/null
+++ b/tests/unittests/test_plots.py
@@ -0,0 +1,13 @@
+"""
+Tests for code_your_own_pandas_pipeline.plots
+"""
+import pytest
+
+import code_your_own_pandas_pipeline.plots
+
+
+class TestExample:
+    """Example test class"""
+    def test_example(self):
+        """Example test case"""
+        assert True
diff --git a/tests/unittests/test_processing.py b/tests/unittests/test_processing.py
new file mode 100644
index 0000000..243dbe9
--- /dev/null
+++ b/tests/unittests/test_processing.py
@@ -0,0 +1,13 @@
+"""
+Tests for code_your_own_pandas_pipeline.processing
+"""
+import pytest
+
+import code_your_own_pandas_pipeline.processing
+
+
+class TestExample:
+    """Example test class"""
+    def test_example(self):
+        """Example test case"""
+        assert True