From c4031dd535101ae22058375d34f385a24636acb8 Mon Sep 17 00:00:00 2001 From: "joseph.wilson8-nhs" Date: Tue, 17 Dec 2024 18:32:27 +0000 Subject: [PATCH 1/6] Adds .vscode/ to .gitignore to exclude Visual Studio Code workspace settings from version control --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a5df42b..377ce2b 100644 --- a/.gitignore +++ b/.gitignore @@ -170,3 +170,4 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.vscode/ From 4ae5bfcf846d3d89b57500e8d30f8d632ea088c7 Mon Sep 17 00:00:00 2001 From: "joseph.wilson8-nhs" Date: Tue, 17 Dec 2024 18:32:34 +0000 Subject: [PATCH 2/6] Adds initial implementation of data processing pipeline with modules for data input, aggregation, processing, and plotting --- code_your_own_pandas_pipeline/aggregations.py | 71 ++++++++++++++++++ code_your_own_pandas_pipeline/config.py | 36 ++++++++++ code_your_own_pandas_pipeline/data_in.py | 45 ++++++++++++ code_your_own_pandas_pipeline/pipeline.py | 40 +++++++++++ code_your_own_pandas_pipeline/plots.py | 72 +++++++++++++++++++ code_your_own_pandas_pipeline/processing.py | 54 ++++++++++++++ 6 files changed, 318 insertions(+) create mode 100644 code_your_own_pandas_pipeline/aggregations.py create mode 100644 code_your_own_pandas_pipeline/config.py create mode 100644 code_your_own_pandas_pipeline/data_in.py create mode 100644 code_your_own_pandas_pipeline/pipeline.py create mode 100644 code_your_own_pandas_pipeline/plots.py create mode 100644 code_your_own_pandas_pipeline/processing.py diff --git a/code_your_own_pandas_pipeline/aggregations.py b/code_your_own_pandas_pipeline/aggregations.py new file mode 100644 index 0000000..6397cc4 --- /dev/null +++ b/code_your_own_pandas_pipeline/aggregations.py @@ -0,0 +1,71 @@ +""" +This module provides functions to pivot and summarize the 
practice level appointment data. +""" + +import pandas as pd +from loguru import logger + +placeholder_df = pd.DataFrame() + + +def pivot_practice_level_data(practice_data: pd.DataFrame) -> pd.DataFrame: + """ + Pivot the practice level data. + + Parameters + ---------- + practice_data : pd.DataFrame + The practice data. + + Returns + ------- + pd.DataFrame + The pivoted data. + """ + logger.info("Pivoting the practice level data.") + + logger.warning("This function is not yet implemented.") + + return placeholder_df + + +def summarize_monthly_gp_appointments(pivot_practice_data: pd.DataFrame) -> pd.DataFrame: + """ + Summarize the monthly appointments by GP and Appointment Status. + + Parameters + ---------- + pivot_practice_data : pd.DataFrame + The pivoted practice data. + + Returns + ------- + pd.DataFrame + The summarized data. + """ + logger.info("Summarizing the monthly GP appointments.") + + logger.warning("This function is not yet implemented.") + + return placeholder_df + + +def summarize_monthly_region_appointments(pivot_practice_data: pd.DataFrame) -> pd.DataFrame: + """ + Summarize the monthly appointments by Region and Appointment Status. + + Parameters + ---------- + pivot_practice_data : pd.DataFrame + The pivoted practice data. + + Returns + ------- + pd.DataFrame + The summarized data. + """ + logger.info("Summarizing the monthly region appointments.") + + logger.warning("This function is not yet implemented.") + + return placeholder_df diff --git a/code_your_own_pandas_pipeline/config.py b/code_your_own_pandas_pipeline/config.py new file mode 100644 index 0000000..a7a1544 --- /dev/null +++ b/code_your_own_pandas_pipeline/config.py @@ -0,0 +1,36 @@ +""" +Configuration file for the code_your_own_pandas_pipeline package. 
+""" + +from pathlib import Path + +from dotenv import load_dotenv +from loguru import logger + +# Load environment variables from .env file if it exists +load_dotenv() + +# Paths +PROJ_ROOT = Path(__file__).resolve().parents[1] +logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}") + +DATA_DIR = (PROJ_ROOT / "data").relative_to(PROJ_ROOT) +RAW_DATA_DIR = DATA_DIR / "raw" +INTERIM_DATA_DIR = DATA_DIR / "interim" +PROCESSED_DATA_DIR = DATA_DIR / "processed" +EXTERNAL_DATA_DIR = DATA_DIR / "external" + +MODELS_DIR = PROJ_ROOT / "models" + +REPORTS_DIR = PROJ_ROOT / "reports" +FIGURES_DIR = REPORTS_DIR / "figures" + +# If tqdm is installed, configure loguru with tqdm.write +# https://github.com/Delgan/loguru/issues/135 +try: + from tqdm import tqdm + + logger.remove(0) + logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) +except ModuleNotFoundError: + pass diff --git a/code_your_own_pandas_pipeline/data_in.py b/code_your_own_pandas_pipeline/data_in.py new file mode 100644 index 0000000..e3df794 --- /dev/null +++ b/code_your_own_pandas_pipeline/data_in.py @@ -0,0 +1,45 @@ +""" +This module contains the function to read the mapping and practice crosstab data from the data +folder. +""" +import pandas as pd +from loguru import logger + +placeholder_df = pd.DataFrame() + + +def read_mapping_data() -> pd.DataFrame: + """ + Read the mapping data from the data folder. + + Returns + ------- + pd.DataFrame + The mapping data. + """ + logger.info(f"Reading mapping data from {""}") + + logger.warning("This function is not yet implemented.") + + return placeholder_df + + +def read_practice_crosstab_data() -> pd.DataFrame: + """ + Read the practice crosstab data from the data folder. + + Returns + ------- + pd.DataFrame + The practice crosstab data. 
+ """ + logger.info(f"Reading practice crosstab data from {""}") + + logger.warning("This function is not yet implemented.") + + return placeholder_df + + +if __name__ == "__main__": + read_mapping_data().head() + read_practice_crosstab_data().head() diff --git a/code_your_own_pandas_pipeline/pipeline.py b/code_your_own_pandas_pipeline/pipeline.py new file mode 100644 index 0000000..baaf377 --- /dev/null +++ b/code_your_own_pandas_pipeline/pipeline.py @@ -0,0 +1,40 @@ +""" +Main pipeline for the code_your_own_pandas_pipeline package. +""" + +from loguru import logger + +from code_your_own_pandas_pipeline import aggregations, config, data_in, plots, processing + + +placeholder_df = pd.DataFrame() + + +def main() -> None: + """ + Main function to run the pipeline. + + Returns + ------- + None + """ + logger.level("START", no=15, color="") + logger.log("START", "Starting the GP Appointment Data Pipeline") + + data_in.read_mapping_data() + data_in.read_practice_crosstab_data() + + processing.tidy_practice_level_data(placeholder_df) + processing.merge_mapping_and_practice_data(placeholder_df, placeholder_df) + + aggregations.pivot_practice_level_data(placeholder_df) + aggregations.summarize_monthly_gp_appointments(placeholder_df) + aggregations.summarize_monthly_region_appointments(placeholder_df) + + plots.plot_monthly_gp_appointments(placeholder_df, "placeholder_str") + plots.plot_monthly_region_appointments(placeholder_df, "placeholder_str") + + logger.success("GP Appointment Data Pipeline Completed") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code_your_own_pandas_pipeline/plots.py b/code_your_own_pandas_pipeline/plots.py new file mode 100644 index 0000000..faa77fd --- /dev/null +++ b/code_your_own_pandas_pipeline/plots.py @@ -0,0 +1,72 @@ +""" +This module provides functions for generating and saving plots. 
+""" + +import pandas as pd +from loguru import logger + + +def save_plot(plot, output_folder: str, plot_name: str) -> None: + """ + Save the plot to the output folder. + + Parameters + ---------- + plot : matplotlib.pyplot + The plot to save. + output_folder : str + The output folder to save the plot. + plot_name : str + The plot name. + + Returns + ------- + None + """ + logger.info(f"Saving the plot {plot_name} to {output_folder}.") + + logger.warning("This function is not yet implemented.") + + +def plot_monthly_gp_appointments( + monthly_gp_appointments: pd.DataFrame, output_folder: str +) -> None: + """ + Plot the monthly GP appointments. + + Parameters + ---------- + monthly_gp_appointments : pd.DataFrame + The monthly GP appointments data. + output_folder : str + The output folder to save the plots. + + Returns + ------- + None + """ + logger.info("Plotting the monthly GP appointments.") + + logger.warning("This function is not yet implemented.") + + +def plot_monthly_region_appointments( + monthly_region_appointments: pd.DataFrame, output_folder: str +) -> None: + """ + Plot the monthly region appointments. + + Parameters + ---------- + monthly_region_appointments : pd.DataFrame + The monthly region appointments data. + output_folder : str + The output folder to save the plots. + + Returns + ------- + None + """ + logger.info("Plotting the monthly region appointments.") + + logger.warning("This function is not yet implemented.") \ No newline at end of file diff --git a/code_your_own_pandas_pipeline/processing.py b/code_your_own_pandas_pipeline/processing.py new file mode 100644 index 0000000..87b77f0 --- /dev/null +++ b/code_your_own_pandas_pipeline/processing.py @@ -0,0 +1,54 @@ +""" +This module contains the functions to process the mapping and practice crosstab data and merge them. 
+""" + +import pandas as pd +from loguru import logger + +placeholder_df = pd.DataFrame() + + +def tidy_practice_level_data(practice_data: pd.DataFrame) -> pd.DataFrame: + """ + Tidy the practice crosstab data. + + Parameters + ---------- + practice_data : pd.DataFrame + The practice crosstab data. + + Returns + ------- + pd.DataFrame + The tidy practice crosstab data. + """ + logger.info("Tidying the practice crosstab data.") + + logger.warning("This function is not yet implemented.") + + return placeholder_df + + +def merge_mapping_and_practice_data( + mapping_data: pd.DataFrame, practice_data: pd.DataFrame +) -> pd.DataFrame: + """ + Merge the mapping and practice data. + + Parameters + ---------- + mapping_data : pd.DataFrame + The mapping data. + practice_data : pd.DataFrame + The practice data. + + Returns + ------- + pd.DataFrame + The merged data. + """ + logger.info("Merging the mapping and practice data.") + + logger.warning("This function is not yet implemented.") + + return placeholder_df From 7b9ce53873d41c0d13b2caf9cd79c82723068128 Mon Sep 17 00:00:00 2001 From: "joseph.wilson8-nhs" Date: Tue, 17 Dec 2024 18:42:13 +0000 Subject: [PATCH 3/6] Renames downloaded data file to _data.zip and removes it after extraction to streamline data handling --- Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 58ec83b..d743dd7 100644 --- a/Makefile +++ b/Makefile @@ -106,8 +106,9 @@ create_tests: .PHONY: get_data get_data: mkdir -p data/raw - wget -O data/raw/Practice_Level_Crosstab_Sep_24.zip https://files.digital.nhs.uk/A5/B4AB19/Practice_Level_Crosstab_Sep_24.zip - unzip -o data/raw/Practice_Level_Crosstab_Sep_24.zip -d data/raw + wget -O data/raw/_data.zip https://files.digital.nhs.uk/A5/B4AB19/Practice_Level_Crosstab_Sep_24.zip + unzip -o data/raw/_data.zip -d data/raw + rm data/raw/_data.zip ################################################################################# @@ -152,4 +153,4 @@ endef 
export PRINT_HELP_PYSCRIPT help: - @$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST) + @$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST) \ No newline at end of file From f824aa98f7477c8ac544df0b390dd0c5d5c8bfe3 Mon Sep 17 00:00:00 2001 From: "joseph.wilson8-nhs" Date: Tue, 17 Dec 2024 18:42:23 +0000 Subject: [PATCH 4/6] Removes placeholder return statements from unimplemented functions in data processing and aggregation modules --- code_your_own_pandas_pipeline/aggregations.py | 6 ------ code_your_own_pandas_pipeline/data_in.py | 9 ++------- code_your_own_pandas_pipeline/pipeline.py | 7 ++++--- code_your_own_pandas_pipeline/plots.py | 2 +- code_your_own_pandas_pipeline/processing.py | 4 ---- 5 files changed, 7 insertions(+), 21 deletions(-) diff --git a/code_your_own_pandas_pipeline/aggregations.py b/code_your_own_pandas_pipeline/aggregations.py index 6397cc4..06d99dc 100644 --- a/code_your_own_pandas_pipeline/aggregations.py +++ b/code_your_own_pandas_pipeline/aggregations.py @@ -26,8 +26,6 @@ def pivot_practice_level_data(practice_data: pd.DataFrame) -> pd.DataFrame: logger.warning("This function is not yet implemented.") - return placeholder_df - def summarize_monthly_gp_appointments(pivot_practice_data: pd.DataFrame) -> pd.DataFrame: """ @@ -47,8 +45,6 @@ def summarize_monthly_gp_appointments(pivot_practice_data: pd.DataFrame) -> pd.D logger.warning("This function is not yet implemented.") - return placeholder_df - def summarize_monthly_region_appointments(pivot_practice_data: pd.DataFrame) -> pd.DataFrame: """ @@ -67,5 +63,3 @@ def summarize_monthly_region_appointments(pivot_practice_data: pd.DataFrame) -> logger.info("Summarizing the monthly region appointments.") logger.warning("This function is not yet implemented.") - - return placeholder_df diff --git a/code_your_own_pandas_pipeline/data_in.py b/code_your_own_pandas_pipeline/data_in.py index e3df794..d3de9d6 100644 --- a/code_your_own_pandas_pipeline/data_in.py +++ 
b/code_your_own_pandas_pipeline/data_in.py @@ -1,12 +1,11 @@ """ -This module contains the function to read the mapping and practice crosstab data from the data +This module contains the function to read the mapping and practice crosstab data from the data folder. """ + import pandas as pd from loguru import logger -placeholder_df = pd.DataFrame() - def read_mapping_data() -> pd.DataFrame: """ @@ -21,8 +20,6 @@ def read_mapping_data() -> pd.DataFrame: logger.warning("This function is not yet implemented.") - return placeholder_df - def read_practice_crosstab_data() -> pd.DataFrame: """ @@ -37,8 +34,6 @@ def read_practice_crosstab_data() -> pd.DataFrame: logger.warning("This function is not yet implemented.") - return placeholder_df - if __name__ == "__main__": read_mapping_data().head() diff --git a/code_your_own_pandas_pipeline/pipeline.py b/code_your_own_pandas_pipeline/pipeline.py index baaf377..64b8d2e 100644 --- a/code_your_own_pandas_pipeline/pipeline.py +++ b/code_your_own_pandas_pipeline/pipeline.py @@ -2,10 +2,10 @@ Main pipeline for the code_your_own_pandas_pipeline package. 
""" +import pandas as pd from loguru import logger -from code_your_own_pandas_pipeline import aggregations, config, data_in, plots, processing - +from code_your_own_pandas_pipeline import aggregations, data_in, plots, processing placeholder_df = pd.DataFrame() @@ -36,5 +36,6 @@ def main() -> None: logger.success("GP Appointment Data Pipeline Completed") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/code_your_own_pandas_pipeline/plots.py b/code_your_own_pandas_pipeline/plots.py index faa77fd..cf4d9cd 100644 --- a/code_your_own_pandas_pipeline/plots.py +++ b/code_your_own_pandas_pipeline/plots.py @@ -69,4 +69,4 @@ def plot_monthly_region_appointments( """ logger.info("Plotting the monthly region appointments.") - logger.warning("This function is not yet implemented.") \ No newline at end of file + logger.warning("This function is not yet implemented.") diff --git a/code_your_own_pandas_pipeline/processing.py b/code_your_own_pandas_pipeline/processing.py index 87b77f0..bac11f3 100644 --- a/code_your_own_pandas_pipeline/processing.py +++ b/code_your_own_pandas_pipeline/processing.py @@ -26,8 +26,6 @@ def tidy_practice_level_data(practice_data: pd.DataFrame) -> pd.DataFrame: logger.warning("This function is not yet implemented.") - return placeholder_df - def merge_mapping_and_practice_data( mapping_data: pd.DataFrame, practice_data: pd.DataFrame @@ -50,5 +48,3 @@ def merge_mapping_and_practice_data( logger.info("Merging the mapping and practice data.") logger.warning("This function is not yet implemented.") - - return placeholder_df From 6e1eba07a6ea5192ce07452a49de0302e9059869 Mon Sep 17 00:00:00 2001 From: "joseph.wilson8-nhs" Date: Tue, 17 Dec 2024 20:24:34 +0000 Subject: [PATCH 5/6] Removes unimplemented function warning and unnecessary main block from data input module --- code_your_own_pandas_pipeline/data_in.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/code_your_own_pandas_pipeline/data_in.py 
b/code_your_own_pandas_pipeline/data_in.py index d3de9d6..0212611 100644 --- a/code_your_own_pandas_pipeline/data_in.py +++ b/code_your_own_pandas_pipeline/data_in.py @@ -33,8 +33,3 @@ def read_practice_crosstab_data() -> pd.DataFrame: logger.info(f"Reading practice crosstab data from {""}") logger.warning("This function is not yet implemented.") - - -if __name__ == "__main__": - read_mapping_data().head() - read_practice_crosstab_data().head() From b0d615183bc0e6fcdef59fd787597f993bc3658a Mon Sep 17 00:00:00 2001 From: "joseph.wilson8-nhs" Date: Tue, 17 Dec 2024 20:27:25 +0000 Subject: [PATCH 6/6] WIP Adds tests for pipeline code --- tests/unittests/test_aggregations.py | 212 +++++++++++++++++++++++++++ tests/unittests/test_data_in.py | 118 +++++++++++++++ tests/unittests/test_pipeline.py | 13 ++ tests/unittests/test_plots.py | 13 ++ tests/unittests/test_processing.py | 13 ++ 5 files changed, 369 insertions(+) create mode 100644 tests/unittests/test_aggregations.py create mode 100644 tests/unittests/test_data_in.py create mode 100644 tests/unittests/test_pipeline.py create mode 100644 tests/unittests/test_plots.py create mode 100644 tests/unittests/test_processing.py diff --git a/tests/unittests/test_aggregations.py b/tests/unittests/test_aggregations.py new file mode 100644 index 0000000..beb45f1 --- /dev/null +++ b/tests/unittests/test_aggregations.py @@ -0,0 +1,212 @@ +""" +Tests for code_your_own_pandas_pipeline.aggregations +""" + +import pandas as pd +import pytest + +from code_your_own_pandas_pipeline.aggregations import ( + pivot_practice_level_data, + summarize_monthly_gp_appointments, + summarize_monthly_region_appointments, +) + + +@pytest.fixture +def practice_test_data(): + return pd.DataFrame( + columns=[ + "APPOINTMENT_MONTH_START_DATE", + "GP_NAME", + "REGION_NAME", + "APPT_STATUS", + "COUNT_OF_APPOINTMENTS", + ], + data=[ + ["2021-01-01", "Example GP A", "REGION1", "ATTENDED", 1], + ["2021-01-01", "Example GP B", "REGION1", "ATTENDED", 4], + 
["2021-02-01", "Example GP A", "REGION1", "ATTENDED", 7], + ["2021-02-01", "Example GP B", "REGION1", "ATTENDED", 10], + ["2021-01-01", "Example GP A", "REGION2", "ATTENDED", 1], + ["2021-01-01", "Example GP B", "REGION2", "ATTENDED", 4], + ["2021-02-01", "Example GP A", "REGION2", "ATTENDED", 7], + ["2021-02-01", "Example GP B", "REGION2", "ATTENDED", 10], + ["2021-01-01", "Example GP A", "REGION1", "ATTENDED", 1], + ["2021-01-01", "Example GP B", "REGION1", "ATTENDED", 4], + ["2021-02-01", "Example GP A", "REGION1", "ATTENDED", 7], + ["2021-02-01", "Example GP B", "REGION1", "ATTENDED", 10], + ["2021-01-01", "Example GP A", "REGION2", "ATTENDED", 1], + ["2021-01-01", "Example GP B", "REGION2", "ATTENDED", 4], + ["2021-02-01", "Example GP A", "REGION2", "ATTENDED", 7], + ["2021-02-01", "Example GP B", "REGION2", "ATTENDED", 10], + ["2021-01-01", "Example GP A", "REGION1", "DID NOT ATTEND", 2], + ["2021-01-01", "Example GP B", "REGION1", "DID NOT ATTEND", 5], + ["2021-02-01", "Example GP A", "REGION1", "DID NOT ATTEND", 8], + ["2021-02-01", "Example GP B", "REGION1", "DID NOT ATTEND", 11], + ["2021-01-01", "Example GP A", "REGION2", "DID NOT ATTEND", 2], + ["2021-01-01", "Example GP B", "REGION2", "DID NOT ATTEND", 5], + ["2021-02-01", "Example GP A", "REGION2", "DID NOT ATTEND", 8], + ["2021-02-01", "Example GP B", "REGION2", "DID NOT ATTEND", 11], + ["2021-01-01", "Example GP A", "REGION1", "DID NOT ATTEND", 2], + ["2021-01-01", "Example GP B", "REGION1", "DID NOT ATTEND", 5], + ["2021-02-01", "Example GP A", "REGION1", "DID NOT ATTEND", 8], + ["2021-02-01", "Example GP B", "REGION1", "DID NOT ATTEND", 11], + ["2021-01-01", "Example GP A", "REGION2", "DID NOT ATTEND", 2], + ["2021-01-01", "Example GP B", "REGION2", "DID NOT ATTEND", 5], + ["2021-02-01", "Example GP A", "REGION2", "DID NOT ATTEND", 8], + ["2021-02-01", "Example GP B", "REGION2", "DID NOT ATTEND", 11], + ["2021-01-01", "Example GP A", "REGION1", "UNKNOWN", 3], + ["2021-01-01", "Example GP B", 
"REGION1", "UNKNOWN", 6], + ["2021-02-01", "Example GP A", "REGION1", "UNKNOWN", 9], + ["2021-02-01", "Example GP B", "REGION1", "UNKNOWN", 12], + ["2021-01-01", "Example GP A", "REGION2", "UNKNOWN", 3], + ["2021-01-01", "Example GP B", "REGION2", "UNKNOWN", 6], + ["2021-02-01", "Example GP A", "REGION2", "UNKNOWN", 9], + ["2021-02-01", "Example GP B", "REGION2", "UNKNOWN", 12], + ["2021-01-01", "Example GP A", "REGION1", "UNKNOWN", 3], + ["2021-01-01", "Example GP B", "REGION1", "UNKNOWN", 6], + ["2021-02-01", "Example GP A", "REGION1", "UNKNOWN", 9], + ["2021-02-01", "Example GP B", "REGION1", "UNKNOWN", 12], + ["2021-01-01", "Example GP A", "REGION2", "UNKNOWN", 3], + ["2021-01-01", "Example GP B", "REGION2", "UNKNOWN", 6], + ["2021-02-01", "Example GP A", "REGION2", "UNKNOWN", 9], + ["2021-02-01", "Example GP B", "REGION2", "UNKNOWN", 12], + ], + ) + + +@pytest.fixture +def practice_pivot_test_data(): + return pd.DataFrame( + columns=[ + "APPOINTMENT_MONTH_START_DATE", + "GP_NAME", + "REGION_NAME", + "ATTENDED", + "DID NOT ATTEND", + "UNKNOWN", + ], + data=[ + ["2021-01-01", "Example GP A", "REGION1", 1, 2, 3], + ["2021-01-01", "Example GP B", "REGION1", 4, 5, 6], + ["2021-02-01", "Example GP A", "REGION1", 7, 8, 9], + ["2021-02-01", "Example GP B", "REGION1", 10, 11, 12], + ["2021-01-01", "Example GP A", "REGION2", 1, 2, 3], + ["2021-01-01", "Example GP B", "REGION2", 4, 5, 6], + ["2021-02-01", "Example GP A", "REGION2", 7, 8, 9], + ["2021-02-01", "Example GP B", "REGION2", 10, 11, 12], + ] + * 2, + ) + + +class TestPivotPracticeLevelData: + """ + Tests for the pivot_practice_level_data function. + """ + + def test_returns_dataframe(self, practice_test_data): + """ + Check that the function returns a DataFrame. + """ + actual = pivot_practice_level_data(practice_test_data) + assert isinstance(actual, pd.DataFrame) + + def test_return_not_empty(self, practice_test_data): + """ + Check that the function returns a non-empty DataFrame. 
+ """ + actual = pivot_practice_level_data(practice_test_data) + assert not actual.empty + + def test_return_pivoted_data(self, practice_test_data): + """ + Check that the function returns the pivoted data. + """ + actual = pivot_practice_level_data(practice_test_data, practice_pivot_test_data) + expected = practice_pivot_test_data + assert actual.assert_frame_equal(expected) + + +class TestSummarizeMonthlyGPAppointments: + """ + Tests for the summarize_monthly_gp_appointments function. + """ + + def test_returns_dataframe(self, practice_pivot_test_data): + """ + Check that the function returns a DataFrame. + """ + actual = summarize_monthly_gp_appointments(practice_pivot_test_data) + assert isinstance(actual, pd.DataFrame) + + def test_return_not_empty(self, practice_pivot_test_data): + """ + Check that the function returns a non-empty DataFrame. + """ + actual = summarize_monthly_gp_appointments(practice_pivot_test_data) + assert not actual.empty + + def test_return_summarized_data(self, practice_pivot_test_data): + """ + Check that the function returns the summarized data. + """ + actual = summarize_monthly_gp_appointments(practice_pivot_test_data) + expected = pd.DataFrame( + [ + ["2021-01-01", "Example GP A", 4, 8, 12], + ["2021-01-01", "Example GP B", 16, 20, 24], + ["2021-02-01", "Example GP A", 28, 32, 36], + ["2021-02-01", "Example GP B", 40, 44, 48], + ], + columns=[ + "APPOINTMENT_MONTH_START_DATE", + "GP_NAME", + "ATTENDED", + "DID NOT ATTEND", + "UNKNOWN", + ], + ) + assert actual.assert_frame_equal(expected) + + +class TestSummarizeMonthlyRegionAppointments: + """ + Tests for the summarize_monthly_region_appointments function. + """ + + def test_returns_dataframe(self, practice_pivot_test_data): + """ + Check that the function returns a DataFrame. 
+ """ + actual = summarize_monthly_region_appointments(practice_pivot_test_data) + assert isinstance(actual, pd.DataFrame) + + def test_return_not_empty(self, practice_pivot_test_data): + """ + Check that the function returns a non-empty DataFrame. + """ + actual = summarize_monthly_region_appointments(practice_pivot_test_data) + assert not actual.empty + + def test_return_summarized_data(self, practice_pivot_test_data): + """ + Check that the function returns the summarized data. + """ + actual = summarize_monthly_region_appointments(practice_pivot_test_data) + expected = pd.DataFrame( + [ + ["2021-01-01", "Example GP A", 4, 8, 12], + ["2021-01-01", "Example GP B", 16, 20, 24], + ["2021-02-01", "Example GP A", 28, 32, 36], + ["2021-02-01", "Example GP B", 40, 44, 48], + ], + columns=[ + "APPOINTMENT_MONTH_START_DATE", + "GP_NAME", + "ATTENDED", + "DID NOT ATTEND", + "UNKNOWN", + ], + ) + assert actual.assert_frame_equal(expected) diff --git a/tests/unittests/test_data_in.py b/tests/unittests/test_data_in.py new file mode 100644 index 0000000..cbb8b08 --- /dev/null +++ b/tests/unittests/test_data_in.py @@ -0,0 +1,118 @@ +""" +Tests for code_your_own_pandas_pipeline.data_in +""" +import pandas as pd +import numpy as np + +from code_your_own_pandas_pipeline.data_in import read_mapping_data, read_practice_crosstab_data + + +class TestReadMappingData: + """ + Tests for the read_mapping_data function. + """ + + def test_return_type(self): + """ + Test that the read_mapping_data function returns a pandas DataFrame. + """ + actual = read_mapping_data() + + assert isinstance(actual, pd.DataFrame) + + def test_return_not_empty(self): + """ + Tests that the read_mapping_data function returns a non-empty DataFrame + """ + actual = read_mapping_data() + + assert not actual.empty + + def test_shape(self): + """ + Tests that the read_mapping_data function returns a DataFrame with the correct shape. 
+ """ + actual = read_mapping_data() + + assert actual.shape == (6241, 11) + + def test_schema(self): + """ + Tests that the read_mapping_data function returns a DataFrame with the correct schema. + """ + actual = read_mapping_data() + + expected_schema = pd.Series( + { + "GP_CODE": np.dtype("O"), + "GP_NAME": np.dtype("O"), + "SUPPLIER": np.dtype("O"), + "PCN_CODE": np.dtype("O"), + "PCN_NAME": np.dtype("O"), + "SUB_ICB_LOCATION_CODE": np.dtype("O"), + "SUB_ICB_LOCATION_NAME": np.dtype("O"), + "ICB_CODE": np.dtype("O"), + "ICB_NAME": np.dtype("O"), + "REGION_CODE": np.dtype("O"), + "REGION_NAME": np.dtype("O"), + } + ) + + assert expected_schema.assert_series_equal(actual.dtypes) + + +class TestReadPracticeCrosstabData: + """ + Tests for the read_practice_crosstab_data function. + """ + + def test_return_type(self): + """ + Test that the read_practice_crosstab_data function returns a pandas DataFrame. + """ + actual = read_practice_crosstab_data() + + assert isinstance(actual, pd.DataFrame) + + def test_return_not_empty(self): + """ + Tests that the read_practice_crosstab_data function returns a non-empty DataFrame + """ + actual = read_practice_crosstab_data() + + assert not actual.empty + + def test_shape(self): + """ + Tests that the read_practice_crosstab_data function returns a DataFrame with the correct shape. + """ + actual = read_practice_crosstab_data() + + assert actual.shape == (2971190, 14) + + def test_schema(self): + """ + Tests that the read_practice_crosstab_data function returns a DataFrame with the correct schema. 
+ """ + actual = read_practice_crosstab_data() + + expected_schema = pd.Series( + { + "APPOINTMENT_MONTH_START_DATE": np.dtype("O"), + "GP_CODE": np.dtype("O"), + "GP_NAME": np.dtype("O"), + "SUPPLIER": np.dtype("O"), + "PCN_CODE": np.dtype("O"), + "PCN_NAME": np.dtype("O"), + "SUB_ICB_LOCATION_CODE": np.dtype("O"), + "SUB_ICB_LOCATION_NAME": np.dtype("O"), + "HCP_TYPE": np.dtype("O"), + "APPT_MODE": np.dtype("O"), + "NATIONAL_CATEGORY": np.dtype("O"), + "TIME_BETWEEN_BOOK_AND_APPT": np.dtype("O"), + "COUNT_OF_APPOINTMENTS": np.dtype("int64"), + "APPT_STATUS": np.dtype("O"), + } + ) + + assert expected_schema.assert_series_equal(actual.dtypes) diff --git a/tests/unittests/test_pipeline.py b/tests/unittests/test_pipeline.py new file mode 100644 index 0000000..59c441a --- /dev/null +++ b/tests/unittests/test_pipeline.py @@ -0,0 +1,13 @@ +""" +Tests for code_your_own_pandas_pipeline.pipeline +""" +import pytest + +import code_your_own_pandas_pipeline.pipeline + + +class TestExample: + """Example test class""" + def test_example(self): + """Example test case""" + assert True diff --git a/tests/unittests/test_plots.py b/tests/unittests/test_plots.py new file mode 100644 index 0000000..c4d1fed --- /dev/null +++ b/tests/unittests/test_plots.py @@ -0,0 +1,13 @@ +""" +Tests for code_your_own_pandas_pipeline.plots +""" +import pytest + +import code_your_own_pandas_pipeline.plots + + +class TestExample: + """Example test class""" + def test_example(self): + """Example test case""" + assert True diff --git a/tests/unittests/test_processing.py b/tests/unittests/test_processing.py new file mode 100644 index 0000000..243dbe9 --- /dev/null +++ b/tests/unittests/test_processing.py @@ -0,0 +1,13 @@ +""" +Tests for code_your_own_pandas_pipeline.processing +""" +import pytest + +import code_your_own_pandas_pipeline.processing + + +class TestExample: + """Example test class""" + def test_example(self): + """Example test case""" + assert True