From 102fc4af221f8ab71e40aa04d401cfbaad498df2 Mon Sep 17 00:00:00 2001 From: Rui Braz Date: Sat, 20 Sep 2025 14:42:04 +0100 Subject: [PATCH] Solved lab --- .../cleaning_functions-checkpoint.py | 63 + ...structuring-and-combining-checkpoint.ipynb | 1447 +++++++++++++++++ .../cleaning_functions.cpython-313.pyc | Bin 0 -> 3717 bytes cleaning_functions.py | 63 + lab-dw-data-structuring-and-combining.ipynb | 1299 ++++++++++++++- 5 files changed, 2862 insertions(+), 10 deletions(-) create mode 100644 .ipynb_checkpoints/cleaning_functions-checkpoint.py create mode 100644 .ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb create mode 100644 __pycache__/cleaning_functions.cpython-313.pyc create mode 100644 cleaning_functions.py diff --git a/.ipynb_checkpoints/cleaning_functions-checkpoint.py b/.ipynb_checkpoints/cleaning_functions-checkpoint.py new file mode 100644 index 0000000..59b46db --- /dev/null +++ b/.ipynb_checkpoints/cleaning_functions-checkpoint.py @@ -0,0 +1,63 @@ +# cleaning_functions.py + +import pandas as pd + +def clean_column_names(df): + """Standardizes all column names to be lowercase and use underscores.""" + df.columns = [col.lower().replace(' ', '_') for col in df.columns] + df.rename(columns={'st': 'state'}, inplace=True) + return df + +def clean_and_format_data(df): + """Cleans inconsistent values and formats data types for specific columns.""" + # Clean gender + gender_map = {'F': 'F', 'M': 'M', 'Male': 'M', 'female': 'F', 'Femal': 'F'} + df['gender'] = df['gender'].map(gender_map) + + # Clean and format customer_lifetime_value + df['customer_lifetime_value'] = df['customer_lifetime_value'].str.replace('%', '') + df['customer_lifetime_value'] = pd.to_numeric(df['customer_lifetime_value'], errors='coerce') + + # Clean and format number_of_open_complaints + df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna('0/0/0') + df['number_of_open_complaints'] = df['number_of_open_complaints'].astype(str).apply(lambda x: x.split('/')[1] if '/' in x else x) + df['number_of_open_complaints'] = pd.to_numeric(df['number_of_open_complaints']) + + return df + +def handle_null_values(df): + """Drops empty rows and fills remaining NaN values with appropriate measures.""" + # Drop rows that are completely empty + df.dropna(how='all', inplace=True) + + # Fill categorical columns + df['gender'] = df['gender'].fillna(df['gender'].mode()[0]) + + # Fill numerical columns + df['customer_lifetime_value'].fillna(df['customer_lifetime_value'].mean()) + df['income'].fillna(df['income'].median()) + df['monthly_premium_auto'].fillna(df['monthly_premium_auto'].mean()) + + # Drop any other rows that still have nulls + df.dropna(inplace=True) + return df + +def convert_to_integer(df): + """Converts all numeric columns to integer type.""" + numeric_cols = df.select_dtypes(include='number').columns + for col in numeric_cols: + df[col] = df[col].astype(int) + return df + + +def clean_customer_data(df): + """Main function to run all data cleaning and formatting steps.""" + df = clean_column_names(df) + df = clean_and_format_data(df) + df = handle_null_values(df) + df = convert_to_integer(df) + df.drop_duplicates() + df.reset_index(drop=True, inplace=True) + + print("Data cleaning and formatting complete.") + return df \ No newline at end of file diff --git a/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb new file mode 100644 index 0000000..71488dd --- /dev/null +++ b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb @@ -0,0 +1,1447 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "url1 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "url2 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "url3 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "\n", + "df1 = pd.read_csv(url1)\n", + "df2 = pd.read_csv(url2)\n", + "df3 = pd.read_csv(url3)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fb2f387f-b5a7-4d88-8223-ff941c6304d1", + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = pd.concat([df1, df2, df3], axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1529d380-9d04-4682-8592-1f58ef38c596", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim AmountStateGender
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323NaNNaN
..........................................
7065LA72316NaNNaNBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764CaliforniaM
7066PK87824NaNNaNCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000CaliforniaF
7067TD14365NaNNaNBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983CaliforniaM
7068UP19263NaNNaNCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000CaliforniaM
7069Y167826NaNNaNCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000CaliforniaM
\n", + "

12074 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 NaN NaN Bachelor \n", + "7066 PK87824 NaN NaN College \n", + "7067 TD14365 NaN NaN Bachelor \n", + "7068 UP19263 NaN NaN College \n", + "7069 Y167826 NaN NaN College \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "7065 23405.98798 71941.0 73.0 \n", + "7066 3096.511217 21604.0 79.0 \n", + "7067 8163.890428 0.0 85.0 \n", + "7068 7524.442436 21941.0 96.0 \n", + "7069 2611.836866 0.0 77.0 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "7065 0 Personal Auto Four-Door Car \n", + "7066 0 Corporate Auto Four-Door Car \n", + "7067 3 Corporate Auto Four-Door Car \n", + "7068 0 Personal Auto Four-Door Car \n", + "7069 0 Corporate Auto Two-Door Car \n", + "\n", + " Total Claim Amount State Gender \n", + "0 2.704934 NaN NaN \n", + "1 1131.464935 NaN NaN \n", + "2 566.472247 NaN NaN \n", + "3 529.881344 NaN NaN \n", + "4 17.269323 NaN NaN \n", + "... ... ... ... \n", + "7065 198.234764 California M \n", + "7066 379.200000 California F \n", + "7067 790.784983 California M \n", + "7068 691.200000 California M \n", + "7069 369.600000 California M \n", + "\n", + "[12074 rows x 13 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b7ab7315-82b0-43cc-8c85-eda04a9728f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Customer 1\n", + "ST 1\n", + "GENDER 1\n", + "Education 1\n", + "Customer Lifetime Value 1\n", + "Income 1\n", + "Monthly Premium Auto 1\n", + "Number of Open Complaints 1\n", + "Policy Type 1\n", + "Vehicle Class 1\n", + "Total Claim Amount 1\n", + "State 1\n", + "Gender 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "print(combined_df.columns.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "eb302317-7356-40cf-b7b5-b188e849803a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 12074 entries, 0 to 7069\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Customer 9137 non-null object \n", + " 1 ST 2067 non-null object \n", + " 2 GENDER 1945 non-null object \n", + " 3 Education 9137 non-null object \n", + " 4 Customer Lifetime Value 9130 non-null object \n", + " 5 Income 9137 non-null float64\n", + " 6 Monthly Premium Auto 9137 non-null float64\n", + " 7 Number of Open Complaints 9137 non-null object \n", + " 8 Policy Type 9137 non-null object \n", + " 9 Vehicle Class 9137 non-null object \n", + " 10 Total Claim Amount 9137 non-null float64\n", + " 11 State 7070 non-null object \n", + " 12 Gender 7070 non-null object \n", + "dtypes: float64(3), object(10)\n", + "memory usage: 1.3+ MB\n", + "None\n" + ] + } + ], + "source": [ + "print(combined_df.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ca44b2ea-b44f-4a19-81f9-294c2483f81d", + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Consolidate the State columns\n", + "# This will take values from 'State' and use them to fill any empty spots in 'ST'\n", + "combined_df['ST'] = combined_df['ST'].fillna(combined_df['State'])\n", + "\n", + "# 2. Consolidate the Gender columns\n", + "# This does the same for the gender columns\n", + "combined_df['GENDER'] = combined_df['GENDER'].fillna(combined_df['Gender'])\n", + "\n", + "# 3. Drop the now-redundant columns\n", + "combined_df.drop(columns=['State', 'Gender'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "45223c1b-4ea3-4f2c-9745-76fb802c2fbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
7065LA72316CaliforniaMBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764
7066PK87824CaliforniaFCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000
7067TD14365CaliforniaMBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983
7068UP19263CaliforniaMCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000
7069Y167826CaliforniaMCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

12074 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 California M Bachelor \n", + "7066 PK87824 California F College \n", + "7067 TD14365 California M Bachelor \n", + "7068 UP19263 California M College \n", + "7069 Y167826 California M College \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "7065 23405.98798 71941.0 73.0 \n", + "7066 3096.511217 21604.0 79.0 \n", + "7067 8163.890428 0.0 85.0 \n", + "7068 7524.442436 21941.0 96.0 \n", + "7069 2611.836866 0.0 77.0 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "7065 0 Personal Auto Four-Door Car \n", + "7066 0 Corporate Auto Four-Door Car \n", + "7067 3 Corporate Auto Four-Door Car \n", + "7068 0 Personal Auto Four-Door Car \n", + "7069 0 Corporate Auto Two-Door Car \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "7065 198.234764 \n", + "7066 379.200000 \n", + "7067 790.784983 \n", + "7068 691.200000 \n", + "7069 369.600000 \n", + "\n", + "[12074 rows x 11 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3c1400ee-89c6-48f3-acfc-269929a91c59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data cleaning and formatting complete.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0QZ44356ArizonaFBachelor6979530940Personal AutoFour-Door Car1131
1AI49188NevadaFBachelor1288743487671080Personal AutoTwo-Door Car566
2WW63253CaliforniaMBachelor76458601060Corporate AutoSUV529
3GA49547WashingtonMHigh School or Below53630736357680Personal AutoFour-Door Car17
4OC83172OregonFBachelor82562962902690Personal AutoTwo-Door Car159
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 QZ44356 Arizona F Bachelor 697953 \n", + "1 AI49188 Nevada F Bachelor 1288743 \n", + "2 WW63253 California M Bachelor 764586 \n", + "3 GA49547 Washington M High School or Below 536307 \n", + "4 OC83172 Oregon F Bachelor 825629 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0 94 0 Personal Auto \n", + "1 48767 108 0 Personal Auto \n", + "2 0 106 0 Corporate Auto \n", + "3 36357 68 0 Personal Auto \n", + "4 62902 69 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 1131 \n", + "1 Two-Door Car 566 \n", + "2 SUV 529 \n", + "3 Four-Door Car 17 \n", + "4 Two-Door Car 159 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from cleaning_functions import clean_customer_data\n", + "\n", + "cleaned_df = clean_customer_data(combined_df.copy())\n", + "\n", + "cleaned_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url4 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n", + "\n", + "df4 = pd.read_csv(url4)\n", + "\n", + "df4.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4d95fdb8-2087-4860-9d27-d6337a81e105", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 27 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 unnamed:_0 10910 non-null int64 \n", + " 1 customer 10910 non-null object \n", + " 2 state 10910 non-null object \n", + " 3 customer_lifetime_value 10910 non-null float64\n", + " 4 response 10910 non-null object \n", + " 5 coverage 10910 non-null object \n", + " 6 education 10910 non-null object \n", + " 7 effective_to_date 10910 non-null object \n", + " 8 employmentstatus 10910 non-null object \n", + " 9 gender 10910 non-null object \n", + " 10 income 10910 non-null int64 \n", + " 11 location_code 10910 non-null object \n", + " 12 marital_status 10910 non-null object \n", + " 13 monthly_premium_auto 10910 non-null int64 \n", + " 14 months_since_last_claim 10910 non-null float64\n", + " 15 months_since_policy_inception 10910 non-null int64 \n", + " 16 number_of_open_complaints 10910 non-null float64\n", + " 17 number_of_policies 10910 non-null int64 \n", + " 18 policy_type 10910 non-null object \n", + " 19 policy 10910 non-null object \n", + " 20 renew_offer_type 10910 non-null object \n", + " 21 sales_channel 10910 non-null object \n", + " 22 total_claim_amount 10910 non-null float64\n", + " 23 vehicle_class 10910 non-null object \n", + " 24 vehicle_size 10910 non-null object \n", + " 25 vehicle_type 10910 non-null object \n", + " 26 month 10910 non-null int64 \n", + "dtypes: float64(4), int64(6), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + } + ], + "source": [ + "df4.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f1354328-a847-4242-a926-3a797a0ea766", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 monthly_premium_auto
sales_channel 
Agent$386,335.00
Branch$280,953.00
Call Center$197,970.00
Web$151,511.00
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#1. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail). Round the total revenue to 2 decimal points\n", + "\n", + "pivot_revenue = df4.pivot_table(\n", + " index='sales_channel',\n", + " values='monthly_premium_auto',\n", + " aggfunc='sum'\n", + ")\n", + "\n", + "pivot_revenue.style.format('${:,.2f}')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "78adae52-13ec-4278-9ab4-f78415d909d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationBachelorCollegeDoctorHigh School or BelowMaster
gender
F7874.2694787748.8233257328.5089168675.2202018157.053154
M7703.6016758052.4592887415.3336388149.6877838168.832659
\n", + "
" + ], + "text/plain": [ + "education Bachelor College Doctor High School or Below \\\n", + "gender \n", + "F 7874.269478 7748.823325 7328.508916 8675.220201 \n", + "M 7703.601675 8052.459288 7415.333638 8149.687783 \n", + "\n", + "education Master \n", + "gender \n", + "F 8157.053154 \n", + "M 8168.832659 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#2. Create a pivot table that shows the average customer lifetime value per gender and education level. \n", + "\n", + "pivot_df4 = df4.pivot_table(\n", + " index='gender',\n", + " columns='education',\n", + " values='customer_lifetime_value',\n", + " aggfunc='mean'\n", + ")\n", + "\n", + "pivot_df4" + ] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).\n", + "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." + ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3a069e0b-b400-470e-904d-d17582191be4", + "metadata": { + "id": "3a069e0b-b400-470e-904d-d17582191be4" + }, + "outputs": [], + "source": [ + "df4['effective_to_date'] = pd.to_datetime(df4['effective_to_date'])\n", + "\n", + "df4['month'] = df4['effective_to_date'].dt.month" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "39db13d1-0f83-4f12-bf9e-e6906631cf0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto 1 443.434952\n", + "1 Corporate Auto 2 385.208135\n", + "2 Personal Auto 1 1727.605722\n", + "3 Personal Auto 2 1453.684441\n", + "4 Special Auto 1 87.074049\n", + "5 Special Auto 2 95.226817\n" + ] + } + ], + "source": [ + "complaints_summary = df4.groupby(['policy_type', 'month'])['number_of_open_complaints'].sum().reset_index()\n", + "\n", + "print(complaints_summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56e68f82-ddb0-40e6-bb57-d73401f8de8a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/__pycache__/cleaning_functions.cpython-313.pyc b/__pycache__/cleaning_functions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4662fc96540da15f7298ba928f01ae5c889523b GIT binary patch literal 3717 zcma)8O>7&-72aJgN&Q&QQ+Fd#q_(QaG>%YPkzGf&WXqQ1RBOdPs=-4WIJS134Vhz)uC#Ni)-|Lz~5dMGRi^YCpiN#R|QN;!c_N#Sb0 zAb7{-QKqeoHv03?wh8EJVBKiL?sufwlpu5q3g~mi1;K7DhuzVhwxYplUKceu$2my> z3sWE$$}S0Ne0@Umb+@RKqP1>PMYnCm@a(eVDz5IpfbS^u{bgLFpCirw*nEV zxWpnLM)9PI!hVulps-1fl&u2M*K^avbC-4F=ec**{AJJ0*?M85xH3}IeSL)bq-^+Q zV!3xlz{U~7a|)IVy_{j2IyUl4W!La6&!ySY8cXoud}w5zpOqYiZSnHy3{(~2i?`1I zb?o!8+Oet6#~xk!P<^2OcDSAx3=)Hz#@3mCByxWq3yw`yCVqKsM~?k!`rhhn+Oa)b|{d-XYk8F>;ftm?U8s>cCg#3Ee*;sZv}9X8G{jYkLkhtsVjbhQ?i7mp1(pes1=p4j zuPu+amL0b}L$~P#XoL$-f*3{8s|eNdYW42>wetJbf>AG6LBXp2%&8UJs#gjMzj!JL zhBzs~=d#2<*xLm`LG54>M=!Ampr4@kF{xB!UA~uxdoqb-z`a|x?dmM1Q+zfS)k`IN zO_d0GAL|OAArCqloD&}A_<=_tS&+e9=o6?a!qb@0cWN`aWqhVRJ`)_fQMtC0Jbr)q z!FWA65F`gSuho*d%0y-L-VCf9%v3WMgVfvg)MSvFtfyv!)a+xUsx8z~?^Y(NaV3yn zdy-0jGW*f&qyFmXT=2$?>g}HfDZM!fIx=ciy&R+`>*<*wJ@fchExmxV{ee8NlOEbT z_cyJU{!u-BD@fn^S`dC5-dAQyYA%109{R-j$ayqT{lR?jCM>?$ocRK2iwwd6_qh^! z1E;E0cyk`-8uSC>t-=W`5=NN@b)igK0mz25hC16yGX{oKwg;?Dii@GpLD3VmhI8>Y zmC$W!ZR*1)sKsBRDV&46p}icaspBP@!nto~(}9qbfNDjw7>KHC{n};Xm8fDmC4WsJ z-b#a>z{#mXfLeISQ|9!!CVQ<|{;~p24`RfE(5yJ7PRl%MMe0mD+CWq5RwEr znDjx#Vn~OM$$A~n^_T6nd0Rg=yLVTrzWO|cq z4gLOZJ##k5oULWXKKp)+)bA4{jE!>JNkT#d>-YFjz}p z*_N+@Nev^5XyPdz^j}1e3@75~xeJY#q34`9-wJ`Sv{4`x5FvAf%WG%;|#l8 zgglP*P}4RIKVRgbkf(KI7`9>&5LP20xgKih#vF^1!!S+p4iu#kF%l4J(U3gQbz@I* z5f+T#%5kV514|wcxpeDlH91gCj@FasgXHe{6PA3 z{OgEt^ws~zV1EMD4~4}Re=W3<30GKb@zuUj1UN7IMq8-fI|_mICA@(+o*KQkaHW~Y;$xAr%iobFCOm3Ern@I)@v#f(Oo?U@>M}x=<90oL z8FQKcCBhR6(>JqfoTtS+_Hl^vH)2OnQNCD)T)+TUp{yGy#q{AJi{>he0@t`cISY5T z7xl?7thRC\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim AmountStateGender
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323NaNNaN
..........................................
7065LA72316NaNNaNBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764CaliforniaM
7066PK87824NaNNaNCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000CaliforniaF
7067TD14365NaNNaNBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983CaliforniaM
7068UP19263NaNNaNCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000CaliforniaM
7069Y167826NaNNaNCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000CaliforniaM
\n", + "

12074 rows × 13 columns

\n", + "" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 NaN NaN Bachelor \n", + "7066 PK87824 NaN NaN College \n", + "7067 TD14365 NaN NaN Bachelor \n", + "7068 UP19263 NaN NaN College \n", + "7069 Y167826 NaN NaN College \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "7065 23405.98798 71941.0 73.0 \n", + "7066 3096.511217 21604.0 79.0 \n", + "7067 8163.890428 0.0 85.0 \n", + "7068 7524.442436 21941.0 96.0 \n", + "7069 2611.836866 0.0 77.0 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "7065 0 Personal Auto Four-Door Car \n", + "7066 0 Corporate Auto Four-Door Car \n", + "7067 3 Corporate Auto Four-Door Car \n", + "7068 0 Personal Auto Four-Door Car \n", + "7069 0 Corporate Auto Two-Door Car \n", + "\n", + " Total Claim Amount State Gender \n", + "0 2.704934 NaN NaN \n", + "1 1131.464935 NaN NaN \n", + "2 566.472247 NaN NaN \n", + "3 529.881344 NaN NaN \n", + "4 17.269323 NaN NaN \n", + "... ... ... ... \n", + "7065 198.234764 California M \n", + "7066 379.200000 California F \n", + "7067 790.784983 California M \n", + "7068 691.200000 California M \n", + "7069 369.600000 California M \n", + "\n", + "[12074 rows x 13 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b7ab7315-82b0-43cc-8c85-eda04a9728f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Customer 1\n", + "ST 1\n", + "GENDER 1\n", + "Education 1\n", + "Customer Lifetime Value 1\n", + "Income 1\n", + "Monthly Premium Auto 1\n", + "Number of Open Complaints 1\n", + "Policy Type 1\n", + "Vehicle Class 1\n", + "Total Claim Amount 1\n", + "State 1\n", + "Gender 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "print(combined_df.columns.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "eb302317-7356-40cf-b7b5-b188e849803a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 12074 entries, 0 to 7069\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Customer 9137 non-null object \n", + " 1 ST 2067 non-null object \n", + " 2 GENDER 1945 non-null object \n", + " 3 Education 9137 non-null object \n", + " 4 Customer Lifetime Value 9130 non-null object \n", + " 5 Income 9137 non-null float64\n", + " 6 Monthly Premium Auto 9137 non-null float64\n", + " 7 Number of Open Complaints 9137 non-null object \n", + " 8 Policy Type 9137 non-null object \n", + " 9 Vehicle Class 9137 non-null object \n", + " 10 Total Claim Amount 9137 non-null float64\n", + " 11 State 7070 non-null object \n", + " 12 Gender 7070 non-null object \n", + "dtypes: float64(3), object(10)\n", + "memory usage: 1.3+ MB\n", + "None\n" + ] + } + ], + "source": [ + "print(combined_df.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ca44b2ea-b44f-4a19-81f9-294c2483f81d", + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Consolidate the State columns\n", + "# This will take values from 'State' and use them to fill any empty spots in 'ST'\n", + "combined_df['ST'] = combined_df['ST'].fillna(combined_df['State'])\n", + "\n", + "# 2. Consolidate the Gender columns\n", + "# This does the same for the gender columns\n", + "combined_df['GENDER'] = combined_df['GENDER'].fillna(combined_df['Gender'])\n", + "\n", + "# 3. Drop the now-redundant columns\n", + "combined_df.drop(columns=['State', 'Gender'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "45223c1b-4ea3-4f2c-9745-76fb802c2fbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
7065LA72316CaliforniaMBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764
7066PK87824CaliforniaFCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000
7067TD14365CaliforniaMBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983
7068UP19263CaliforniaMCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000
7069Y167826CaliforniaMCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

12074 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 California M Bachelor \n", + "7066 PK87824 California F College \n", + "7067 TD14365 California M Bachelor \n", + "7068 UP19263 California M College \n", + "7069 Y167826 California M College \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "7065 23405.98798 71941.0 73.0 \n", + "7066 3096.511217 21604.0 79.0 \n", + "7067 8163.890428 0.0 85.0 \n", + "7068 7524.442436 21941.0 96.0 \n", + "7069 2611.836866 0.0 77.0 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "7065 0 Personal Auto Four-Door Car \n", + "7066 0 Corporate Auto Four-Door Car \n", + "7067 3 Corporate Auto Four-Door Car \n", + "7068 0 Personal Auto Four-Door Car \n", + "7069 0 Corporate Auto Two-Door Car \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "7065 198.234764 \n", + "7066 379.200000 \n", + "7067 790.784983 \n", + "7068 691.200000 \n", + "7069 369.600000 \n", + "\n", + "[12074 rows x 11 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3c1400ee-89c6-48f3-acfc-269929a91c59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data cleaning and formatting complete.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0QZ44356ArizonaFBachelor6979530940Personal AutoFour-Door Car1131
1AI49188NevadaFBachelor1288743487671080Personal AutoTwo-Door Car566
2WW63253CaliforniaMBachelor76458601060Corporate AutoSUV529
3GA49547WashingtonMHigh School or Below53630736357680Personal AutoFour-Door Car17
4OC83172OregonFBachelor82562962902690Personal AutoTwo-Door Car159
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 QZ44356 Arizona F Bachelor 697953 \n", + "1 AI49188 Nevada F Bachelor 1288743 \n", + "2 WW63253 California M Bachelor 764586 \n", + "3 GA49547 Washington M High School or Below 536307 \n", + "4 OC83172 Oregon F Bachelor 825629 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0 94 0 Personal Auto \n", + "1 48767 108 0 Personal Auto \n", + "2 0 106 0 Corporate Auto \n", + "3 36357 68 0 Personal Auto \n", + "4 62902 69 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 1131 \n", + "1 Two-Door Car 566 \n", + "2 SUV 529 \n", + "3 Four-Door Car 17 \n", + "4 Two-Door Car 159 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from cleaning_functions import clean_customer_data\n", + "\n", + "cleaned_df = clean_customer_data(combined_df.copy())\n", + "\n", + "cleaned_df.head()" ] }, { @@ -72,14 +890,439 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url4 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n", + "\n", + "df4 = pd.read_csv(url4)\n", + "\n", + "df4.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4d95fdb8-2087-4860-9d27-d6337a81e105", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 27 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 unnamed:_0 10910 non-null int64 \n", + " 1 customer 10910 non-null object \n", + " 2 state 10910 non-null object \n", + " 3 customer_lifetime_value 10910 non-null float64\n", + " 4 response 10910 non-null object \n", + " 5 coverage 10910 non-null object \n", + " 6 education 10910 non-null object \n", + " 7 effective_to_date 10910 non-null object \n", + " 8 employmentstatus 10910 non-null object \n", + " 9 gender 10910 non-null object \n", + " 10 income 10910 non-null int64 \n", + " 11 location_code 10910 non-null object \n", + " 12 marital_status 10910 non-null object \n", + " 13 monthly_premium_auto 10910 non-null int64 \n", + " 14 months_since_last_claim 10910 non-null float64\n", + " 15 months_since_policy_inception 10910 non-null int64 \n", + " 16 number_of_open_complaints 10910 non-null float64\n", + " 17 number_of_policies 10910 non-null int64 \n", + " 18 policy_type 10910 non-null object \n", + " 19 policy 10910 non-null object \n", + " 20 renew_offer_type 10910 non-null object \n", + " 21 sales_channel 10910 non-null object \n", + " 22 total_claim_amount 10910 non-null float64\n", + " 23 vehicle_class 10910 non-null object \n", + " 24 vehicle_size 10910 non-null object \n", + " 25 vehicle_type 10910 non-null object \n", + " 26 month 10910 non-null int64 \n", + "dtypes: float64(4), int64(6), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + } + ], + "source": [ + "df4.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f1354328-a847-4242-a926-3a797a0ea766", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 monthly_premium_auto
sales_channel 
Agent$386,335.00
Branch$280,953.00
Call Center$197,970.00
Web$151,511.00
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#1. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail). Round the total revenue to 2 decimal points\n", + "\n", + "pivot_revenue = df4.pivot_table(\n", + " index='sales_channel',\n", + " values='monthly_premium_auto',\n", + " aggfunc='sum'\n", + ")\n", + "\n", + "pivot_revenue.style.format('${:,.2f}')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "78adae52-13ec-4278-9ab4-f78415d909d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationBachelorCollegeDoctorHigh School or BelowMaster
gender
F7874.2694787748.8233257328.5089168675.2202018157.053154
M7703.6016758052.4592887415.3336388149.6877838168.832659
\n", + "
" + ], + "text/plain": [ + "education Bachelor College Doctor High School or Below \\\n", + "gender \n", + "F 7874.269478 7748.823325 7328.508916 8675.220201 \n", + "M 7703.601675 8052.459288 7415.333638 8149.687783 \n", + "\n", + "education Master \n", + "gender \n", + "F 8157.053154 \n", + "M 8168.832659 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "#2. Create a pivot table that shows the average customer lifetime value per gender and education level. \n", + "\n", + "pivot_df4 = df4.pivot_table(\n", + " index='gender',\n", + " columns='education',\n", + " values='customer_lifetime_value',\n", + " aggfunc='mean'\n", + ")\n", + "\n", + "pivot_df4" ] }, { @@ -130,15 +1373,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" }, "outputs": [], "source": [ - "# Your code goes here" + "df4['effective_to_date'] = pd.to_datetime(df4['effective_to_date'])\n", + "\n", + "df4['month'] = df4['effective_to_date'].dt.month" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "39db13d1-0f83-4f12-bf9e-e6906631cf0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto 1 443.434952\n", + "1 Corporate Auto 2 385.208135\n", + "2 Personal Auto 1 1727.605722\n", + "3 Personal Auto 2 1453.684441\n", + "4 Special Auto 1 87.074049\n", + "5 Special Auto 2 95.226817\n" + ] + } + ], + "source": [ + "complaints_summary = df4.groupby(['policy_type', 'month'])['number_of_open_complaints'].sum().reset_index()\n", + "\n", + "print(complaints_summary)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56e68f82-ddb0-40e6-bb57-d73401f8de8a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -146,9 +1425,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -160,7 +1439,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,