From 5413e5be9353d5de7b0e63d944b6cfbdf8db6dd4 Mon Sep 17 00:00:00 2001 From: luispabloaiello-da Date: Wed, 10 Sep 2025 22:05:10 +0200 Subject: [PATCH] Solved lab --- ...structuring-and-combining-checkpoint.ipynb | 1981 +++++++++++++++++ lab-dw-data-structuring-and-combining.ipynb | 1833 ++++++++++++++- 2 files changed, 3804 insertions(+), 10 deletions(-) create mode 100644 .ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb diff --git a/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb new file mode 100644 index 0000000..e30623b --- /dev/null +++ b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb @@ -0,0 +1,1981 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. 
The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code goes here\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n", + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")\n", + "df1\n", + "# df.info()\n", + "# df.dtypes\n", + "# df.nunique()\n", + "# df.describe()\n", + "# display(df.isna().sum())\n", + "# display((df.isna().sum()/df.shape[0])*100)\n", + "# df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4a50bdcb-ec5e-42ab-b5ca-ee04958d2d2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
12069LA72316CaliforniaMBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764
12070PK87824CaliforniaFCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000
12071TD14365CaliforniaMBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983
12072UP19263CaliforniaMCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000
12073Y167826CaliforniaMCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

12074 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "12069 LA72316 California M Bachelor \n", + "12070 PK87824 California F College \n", + "12071 TD14365 California M Bachelor \n", + "12072 UP19263 California M College \n", + "12073 Y167826 California M College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "12069 23405.98798 71941.0 73.0 \n", + "12070 3096.511217 21604.0 79.0 \n", + "12071 8163.890428 0.0 85.0 \n", + "12072 7524.442436 21941.0 96.0 \n", + "12073 2611.836866 0.0 77.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "12069 0 Personal Auto Four-Door Car \n", + "12070 0 Corporate Auto Four-Door Car \n", + "12071 3 Corporate Auto Four-Door Car \n", + "12072 0 Personal Auto Four-Door Car \n", + "12073 0 Corporate Auto Two-Door Car \n", + "\n", + " total_claim_amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "12069 198.234764 \n", + "12070 379.200000 \n", + "12071 790.784983 \n", + "12072 691.200000 \n", + "12073 369.600000 \n", + "\n", + "[12074 rows x 11 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 2: Standardize column names\n", + "def clean_columns(df):\n", + " df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]\n", + " if 'st' in df.columns:\n", + " df.rename(columns={'st': 'state'}, inplace=True)\n", + " return df\n", + "\n", + "df1 = clean_columns(df1)\n", + "df2 = clean_columns(df2)\n", + "df3 = clean_columns(df3)\n", + "\n", + "# Step 3: Concatenate all dataframes\n", + "df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "23e40676-1b19-423d-8590-0bd11cbb5401", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNoneMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington None Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 4: Clean 'gender' column (make consistent: 'M', 'F', None)\n", + "df['gender'] = df['gender'].astype(str).str.strip().str.upper().replace({'FEMALE': 'F', 'MALE': 'M', 'FEMAL': 'F', 'nan': None, 'NAN': None})\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "03d06e92-eb21-48dc-8ead-6abd1d236735", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNoneMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.590.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.1748767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.180.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.6536357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington None Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59 \n", + "2 AI49188 Nevada F Bachelor 1288743.17 \n", + "3 WW63253 California M Bachelor 764586.18 \n", + "4 GA49547 Washington M High School or Below 536307.65 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 5: Clean 'customer_lifetime_value' column: remove '%' and convert to float\n", + "df['customer_lifetime_value'] = df['customer_lifetime_value'].astype(str).str.replace('%', '', regex=False)\n", + "df['customer_lifetime_value'] = pd.to_numeric(df['customer_lifetime_value'], errors='coerce')\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "61c55c39-51e8-40a6-b0f8-c94909fa279c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNoneMasterNaN0.01000.01Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.590.094.01Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.1748767.0108.01Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.180.0106.01Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.6536357.068.01Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington None Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59 \n", + "2 AI49188 Nevada F Bachelor 1288743.17 \n", + "3 WW63253 California M Bachelor 764586.18 \n", + "4 GA49547 Washington M High School or Below 536307.65 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1 Personal Auto \n", + "1 0.0 94.0 1 Personal Auto \n", + "2 48767.0 108.0 1 Personal Auto \n", + "3 0.0 106.0 1 Corporate Auto \n", + "4 36357.0 68.0 1 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 6: Clean 'number_of_open_complaints' column: extract numeric value\n", + "df['number_of_open_complaints'] = df['number_of_open_complaints'].astype(str).str.extract('(\\d+)')\n", + "df['number_of_open_complaints'] = pd.to_numeric(df['number_of_open_complaints'], errors='coerce').fillna(0).astype(int)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "37464a1a-c06a-4565-902c-d55884427187", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 2937\n", + "gender 3059\n", + "education 2937\n", + "customer_lifetime_value 2944\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 0\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(df.isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9e79ebe6-05ed-40f2-b02b-3a9eb96d7305", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0QZ44356ArizonaFBachelor6.979536e+050.094.01Personal AutoFour-Door Car1131.464935
1AI49188NevadaFBachelor1.288743e+0648767.0108.01Personal AutoTwo-Door Car566.472247
2WW63253CaliforniaMBachelor7.645862e+050.0106.01Corporate AutoSUV529.881344
3GA49547WashingtonMHigh School or Below5.363077e+0536357.068.01Personal AutoFour-Door Car17.269323
4OC83172OregonFBachelor8.256298e+0562902.069.01Personal AutoTwo-Door Car159.383042
....................................
9005LA72316CaliforniaMBachelor2.340599e+0471941.073.00Personal AutoFour-Door Car198.234764
9006PK87824CaliforniaFCollege3.096511e+0321604.079.00Corporate AutoFour-Door Car379.200000
9007TD14365CaliforniaMBachelor8.163890e+030.085.03Corporate AutoFour-Door Car790.784983
9008UP19263CaliforniaMCollege7.524442e+0321941.096.00Personal AutoFour-Door Car691.200000
9009Y167826CaliforniaMCollege2.611837e+030.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

9010 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 QZ44356 Arizona F Bachelor \n", + "1 AI49188 Nevada F Bachelor \n", + "2 WW63253 California M Bachelor \n", + "3 GA49547 Washington M High School or Below \n", + "4 OC83172 Oregon F Bachelor \n", + "... ... ... ... ... \n", + "9005 LA72316 California M Bachelor \n", + "9006 PK87824 California F College \n", + "9007 TD14365 California M Bachelor \n", + "9008 UP19263 California M College \n", + "9009 Y167826 California M College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 6.979536e+05 0.0 94.0 \n", + "1 1.288743e+06 48767.0 108.0 \n", + "2 7.645862e+05 0.0 106.0 \n", + "3 5.363077e+05 36357.0 68.0 \n", + "4 8.256298e+05 62902.0 69.0 \n", + "... ... ... ... \n", + "9005 2.340599e+04 71941.0 73.0 \n", + "9006 3.096511e+03 21604.0 79.0 \n", + "9007 8.163890e+03 0.0 85.0 \n", + "9008 7.524442e+03 21941.0 96.0 \n", + "9009 2.611837e+03 0.0 77.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 1 Personal Auto Four-Door Car \n", + "1 1 Personal Auto Two-Door Car \n", + "2 1 Corporate Auto SUV \n", + "3 1 Personal Auto Four-Door Car \n", + "4 1 Personal Auto Two-Door Car \n", + "... ... ... ... \n", + "9005 0 Personal Auto Four-Door Car \n", + "9006 0 Corporate Auto Four-Door Car \n", + "9007 3 Corporate Auto Four-Door Car \n", + "9008 0 Personal Auto Four-Door Car \n", + "9009 0 Corporate Auto Two-Door Car \n", + "\n", + " total_claim_amount \n", + "0 1131.464935 \n", + "1 566.472247 \n", + "2 529.881344 \n", + "3 17.269323 \n", + "4 159.383042 \n", + "... ... 
\n", + "9005 198.234764 \n", + "9006 379.200000 \n", + "9007 790.784983 \n", + "9008 691.200000 \n", + "9009 369.600000 \n", + "\n", + "[9010 rows x 11 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 7: Drop rows that contain a NaN in any column\n", + "df.dropna(inplace=True)\n", + "\n", + "# Step 8: Reset index\n", + "df.reset_index(drop=True, inplace=True)\n", + "\n", + "# Display cleaned dataframe\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n", + " 'response', 'coverage', 'education', 'effective_to_date',\n", + " 'employmentstatus', 'gender', 'income', 'location_code',\n", + " 'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n", + " 'months_since_policy_inception', 'number_of_open_complaints',\n", + " 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n", + " 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n", + " 'vehicle_type', 'month'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgenderincome...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
0DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM48029...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
1KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF0...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
2LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM22139...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
3XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM49078...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
4QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF23675...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
..................................................................
10905FE99816Nevada15563.369440NoPremiumBachelor2011-01-19UnemployedF0...7Personal AutoPersonal L1Offer3Web1214.400000Luxury CarMedsizeA1
10906KX53892Oregon5259.444853NoBasicCollege2011-01-06EmployedF61146...6Personal AutoPersonal L3Offer2Branch273.018929Four-Door CarMedsizeA1
10907TL39050Arizona23893.304100NoExtendedBachelor2011-02-06EmployedF39837...2Corporate AutoCorporate L3Offer1Web381.306996Luxury SUVMedsizeA2
10908WA60547California11971.977650NoPremiumCollege2011-02-13EmployedF64195...6Personal AutoPersonal L1Offer1Branch618.288849SUVMedsizeA2
10909IV32877California6857.519928NoBasicBachelor2011-01-08UnemployedM0...3Personal AutoPersonal L1Offer4Web1021.719397SUVMedsizeA1
\n", + "

10910 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " customer state customer_lifetime_value response coverage \\\n", + "0 DK49336 Arizona 4809.216960 No Basic \n", + "1 KX64629 California 2228.525238 No Basic \n", + "2 LZ68649 Washington 14947.917300 No Basic \n", + "3 XL78013 Oregon 22332.439460 Yes Extended \n", + "4 QA50777 Oregon 9025.067525 No Premium \n", + "... ... ... ... ... ... \n", + "10905 FE99816 Nevada 15563.369440 No Premium \n", + "10906 KX53892 Oregon 5259.444853 No Basic \n", + "10907 TL39050 Arizona 23893.304100 No Extended \n", + "10908 WA60547 California 11971.977650 No Premium \n", + "10909 IV32877 California 6857.519928 No Basic \n", + "\n", + " education effective_to_date employmentstatus gender income ... \\\n", + "0 College 2011-02-18 Employed M 48029 ... \n", + "1 College 2011-01-18 Unemployed F 0 ... \n", + "2 Bachelor 2011-02-10 Employed M 22139 ... \n", + "3 College 2011-01-11 Employed M 49078 ... \n", + "4 Bachelor 2011-01-17 Medical Leave F 23675 ... \n", + "... ... ... ... ... ... ... \n", + "10905 Bachelor 2011-01-19 Unemployed F 0 ... \n", + "10906 College 2011-01-06 Employed F 61146 ... \n", + "10907 Bachelor 2011-02-06 Employed F 39837 ... \n", + "10908 College 2011-02-13 Employed F 64195 ... \n", + "10909 Bachelor 2011-01-08 Unemployed M 0 ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "... ... ... ... ... 
\n", + "10905 7 Personal Auto Personal L1 Offer3 \n", + "10906 6 Personal Auto Personal L3 Offer2 \n", + "10907 2 Corporate Auto Corporate L3 Offer1 \n", + "10908 6 Personal Auto Personal L1 Offer1 \n", + "10909 3 Personal Auto Personal L1 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "... ... ... ... ... \n", + "10905 Web 1214.400000 Luxury Car Medsize \n", + "10906 Branch 273.018929 Four-Door Car Medsize \n", + "10907 Web 381.306996 Luxury SUV Medsize \n", + "10908 Branch 618.288849 SUV Medsize \n", + "10909 Web 1021.719397 SUV Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "... ... ... \n", + "10905 A 1 \n", + "10906 A 1 \n", + "10907 A 2 \n", + "10908 A 2 \n", + "10909 A 1 \n", + "\n", + "[10910 rows x 26 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Your code goes here\n", + "# Step 1: Load the marketing customer analysis data\n", + "marketing_df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n", + "\n", + "# marketing_df.info()\n", + "# print()\n", + "# marketing_df.dtypes\n", + "# print()\n", + "# marketing_df.nunique()\n", + "# print()\n", + "# marketing_df.describe()\n", + "# print()\n", + "# display(marketing_df.isna().sum())\n", + "# print()\n", + "# display((marketing_df.isna().sum()/marketing_df.shape[0])*100)\n", + "print()\n", + "# df.head()\n", + "display(marketing_df.columns)\n", + "\n", + "# Step 2: Inspect and clean columns if needed\n", + "#marketing_df.columns = [c.strip().lower().replace(' ', '_') for c in marketing_df.columns]\n", + "\n", + "# Step 2: Remove 
column 'unnamed:_0' that seems to be the same as the implicit index\n", + "marketing_df = marketing_df.drop(['unnamed:_0'], axis=1)\n", + "display(marketing_df)" + ] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).\n", + "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "fb7dc6e4-87ee-488a-96d8-e99f9cb4f745", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " customer_lifetime_value\n", + "sales_channel \n", + "Agent 33057887.85\n", + "Branch 24359201.21\n", + "Call Center 17364288.37\n", + "Web 12697632.90\n" + ] + } + ], + "source": [ + "# Step 3: Total revenue by sales channel\n", + "revenue_by_channel = marketing_df.pivot_table(index='sales_channel', values='customer_lifetime_value', aggfunc='sum').round(2)\n", + "\n", + "print(revenue_by_channel)" + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights."
+ ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "f9f08943-d99a-4f17-b18c-f7df40e15dfe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "education Bachelor College Doctor High School or Below Master\n", + "gender \n", + "F 7874.27 7748.82 7328.51 8675.22 8157.05\n", + "M 7703.60 8052.46 7415.33 8149.69 8168.83\n" + ] + } + ], + "source": [ + "# Step 4: Average customer lifetime value by gender and education\n", + "clv_by_gender_education = marketing_df.pivot_table(index='gender', columns='education', values='customer_lifetime_value', aggfunc='mean').round(2)\n", + "\n", + "print(clv_by_gender_education)" + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." + ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. 
The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "3a069e0b-b400-470e-904d-d17582191be4", + "metadata": { + "id": "3a069e0b-b400-470e-904d-d17582191be4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto 1 443.434952\n", + "1 Corporate Auto 2 385.208135\n", + "2 Personal Auto 1 1727.605722\n", + "3 Personal Auto 2 1453.684441\n", + "4 Special Auto 1 87.074049\n", + "5 Special Auto 2 95.226817\n" + ] + } + ], + "source": [ + "# Your code goes here\n", + "# Step 5: Extract month from date\n", + "marketing_df['month'] = pd.to_datetime(marketing_df['effective_to_date']).dt.month\n", + "\n", + "# Complaints by policy type and month, long format\n", + "complaints_summary = marketing_df.groupby(['policy_type', 'month'])['number_of_open_complaints'].sum().reset_index()\n", + "\n", + "print(complaints_summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d8d09e1-8f22-463c-9ac9-d3bd978cd884", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..e30623b 100644 --- 
a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,14 +36,1313 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code goes here\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n", + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")\n", + "df1\n", + "# df.info()\n", + "# df.dtypes\n", + "# df.nunique()\n", + "# df.describe()\n", + "# display(df.isna().sum())\n", + "# display((df.isna().sum()/df.shape[0])*100)\n", + "# df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4a50bdcb-ec5e-42ab-b5ca-ee04958d2d2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
12069LA72316CaliforniaMBachelor23405.9879871941.073.00Personal AutoFour-Door Car198.234764
12070PK87824CaliforniaFCollege3096.51121721604.079.00Corporate AutoFour-Door Car379.200000
12071TD14365CaliforniaMBachelor8163.8904280.085.03Corporate AutoFour-Door Car790.784983
12072UP19263CaliforniaMCollege7524.44243621941.096.00Personal AutoFour-Door Car691.200000
12073Y167826CaliforniaMCollege2611.8368660.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

12074 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "12069 LA72316 California M Bachelor \n", + "12070 PK87824 California F College \n", + "12071 TD14365 California M Bachelor \n", + "12072 UP19263 California M College \n", + "12073 Y167826 California M College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "12069 23405.98798 71941.0 73.0 \n", + "12070 3096.511217 21604.0 79.0 \n", + "12071 8163.890428 0.0 85.0 \n", + "12072 7524.442436 21941.0 96.0 \n", + "12073 2611.836866 0.0 77.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "12069 0 Personal Auto Four-Door Car \n", + "12070 0 Corporate Auto Four-Door Car \n", + "12071 3 Corporate Auto Four-Door Car \n", + "12072 0 Personal Auto Four-Door Car \n", + "12073 0 Corporate Auto Two-Door Car \n", + "\n", + " total_claim_amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "12069 198.234764 \n", + "12070 379.200000 \n", + "12071 790.784983 \n", + "12072 691.200000 \n", + "12073 369.600000 \n", + "\n", + "[12074 rows x 11 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 2: Standardize column names\n", + "def clean_columns(df):\n", + " df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]\n", + " if 'st' in df.columns:\n", + " df.rename(columns={'st': 'state'}, inplace=True)\n", + " return df\n", + "\n", + "df1 = clean_columns(df1)\n", + "df2 = clean_columns(df2)\n", + "df3 = clean_columns(df3)\n", + "\n", + "# Step 3: Concatenate all dataframes\n", + "df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "23e40676-1b19-423d-8590-0bd11cbb5401", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNoneMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington None Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 4: Clean 'gender' column (make consistent: 'M', 'F', None)\n", + "df['gender'] = df['gender'].astype(str).str.strip().str.upper().replace({'FEMALE': 'F', 'MALE': 'M', 'FEMAL': 'F', 'nan': None, 'NAN': None})\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "03d06e92-eb21-48dc-8ead-6abd1d236735", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNoneMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.590.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.1748767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.180.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.6536357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington None Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59 \n", + "2 AI49188 Nevada F Bachelor 1288743.17 \n", + "3 WW63253 California M Bachelor 764586.18 \n", + "4 GA49547 Washington M High School or Below 536307.65 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 5: Clean 'customer_lifetime_value' column: remove '%' and convert to float\n", + "df['customer_lifetime_value'] = df['customer_lifetime_value'].astype(str).str.replace('%', '', regex=False)\n", + "df['customer_lifetime_value'] = pd.to_numeric(df['customer_lifetime_value'], errors='coerce')\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "61c55c39-51e8-40a6-b0f8-c94909fa279c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonNoneMasterNaN0.01000.01Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.590.094.01Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.1748767.0108.01Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.180.0106.01Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.6536357.068.01Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington None Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59 \n", + "2 AI49188 Nevada F Bachelor 1288743.17 \n", + "3 WW63253 California M Bachelor 764586.18 \n", + "4 GA49547 Washington M High School or Below 536307.65 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1 Personal Auto \n", + "1 0.0 94.0 1 Personal Auto \n", + "2 48767.0 108.0 1 Personal Auto \n", + "3 0.0 106.0 1 Corporate Auto \n", + "4 36357.0 68.0 1 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 6: Clean 'number_of_open_complaints' column: extract numeric value\n", + "df['number_of_open_complaints'] = df['number_of_open_complaints'].astype(str).str.extract('(\\d+)')\n", + "df['number_of_open_complaints'] = pd.to_numeric(df['number_of_open_complaints'], errors='coerce').fillna(0).astype(int)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "37464a1a-c06a-4565-902c-d55884427187", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 2937\n", + "gender 3059\n", + "education 2937\n", + "customer_lifetime_value 2944\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 0\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(df.isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9e79ebe6-05ed-40f2-b02b-3a9eb96d7305", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0QZ44356ArizonaFBachelor6.979536e+050.094.01Personal AutoFour-Door Car1131.464935
1AI49188NevadaFBachelor1.288743e+0648767.0108.01Personal AutoTwo-Door Car566.472247
2WW63253CaliforniaMBachelor7.645862e+050.0106.01Corporate AutoSUV529.881344
3GA49547WashingtonMHigh School or Below5.363077e+0536357.068.01Personal AutoFour-Door Car17.269323
4OC83172OregonFBachelor8.256298e+0562902.069.01Personal AutoTwo-Door Car159.383042
....................................
9005LA72316CaliforniaMBachelor2.340599e+0471941.073.00Personal AutoFour-Door Car198.234764
9006PK87824CaliforniaFCollege3.096511e+0321604.079.00Corporate AutoFour-Door Car379.200000
9007TD14365CaliforniaMBachelor8.163890e+030.085.03Corporate AutoFour-Door Car790.784983
9008UP19263CaliforniaMCollege7.524442e+0321941.096.00Personal AutoFour-Door Car691.200000
9009Y167826CaliforniaMCollege2.611837e+030.077.00Corporate AutoTwo-Door Car369.600000
\n", + "

9010 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 QZ44356 Arizona F Bachelor \n", + "1 AI49188 Nevada F Bachelor \n", + "2 WW63253 California M Bachelor \n", + "3 GA49547 Washington M High School or Below \n", + "4 OC83172 Oregon F Bachelor \n", + "... ... ... ... ... \n", + "9005 LA72316 California M Bachelor \n", + "9006 PK87824 California F College \n", + "9007 TD14365 California M Bachelor \n", + "9008 UP19263 California M College \n", + "9009 Y167826 California M College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 6.979536e+05 0.0 94.0 \n", + "1 1.288743e+06 48767.0 108.0 \n", + "2 7.645862e+05 0.0 106.0 \n", + "3 5.363077e+05 36357.0 68.0 \n", + "4 8.256298e+05 62902.0 69.0 \n", + "... ... ... ... \n", + "9005 2.340599e+04 71941.0 73.0 \n", + "9006 3.096511e+03 21604.0 79.0 \n", + "9007 8.163890e+03 0.0 85.0 \n", + "9008 7.524442e+03 21941.0 96.0 \n", + "9009 2.611837e+03 0.0 77.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 1 Personal Auto Four-Door Car \n", + "1 1 Personal Auto Two-Door Car \n", + "2 1 Corporate Auto SUV \n", + "3 1 Personal Auto Four-Door Car \n", + "4 1 Personal Auto Two-Door Car \n", + "... ... ... ... \n", + "9005 0 Personal Auto Four-Door Car \n", + "9006 0 Corporate Auto Four-Door Car \n", + "9007 3 Corporate Auto Four-Door Car \n", + "9008 0 Personal Auto Four-Door Car \n", + "9009 0 Corporate Auto Two-Door Car \n", + "\n", + " total_claim_amount \n", + "0 1131.464935 \n", + "1 566.472247 \n", + "2 529.881344 \n", + "3 17.269323 \n", + "4 159.383042 \n", + "... ... 
\n", + "9005 198.234764 \n", + "9006 379.200000 \n", + "9007 790.784983 \n", + "9008 691.200000 \n", + "9009 369.600000 \n", + "\n", + "[9010 rows x 11 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "# Step 7: Drop rows that contain any NaN value (dropna defaults to how='any')\n", + "df.dropna(inplace=True)\n", + "\n", + "# Step 8: Reset index\n", + "df.reset_index(drop=True, inplace=True)\n", + "\n", + "# Display cleaned dataframe\n", + "df" ] }, { @@ -72,14 +1371,449 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n", + " 'response', 'coverage', 'education', 'effective_to_date',\n", + " 'employmentstatus', 'gender', 'income', 'location_code',\n", + " 'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n", + " 'months_since_policy_inception', 'number_of_open_complaints',\n", + " 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n", + " 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n", + " 'vehicle_type', 'month'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgenderincome...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
0DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM48029...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
1KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF0...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
2LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM22139...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
3XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM49078...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
4QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF23675...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
..................................................................
10905FE99816Nevada15563.369440NoPremiumBachelor2011-01-19UnemployedF0...7Personal AutoPersonal L1Offer3Web1214.400000Luxury CarMedsizeA1
10906KX53892Oregon5259.444853NoBasicCollege2011-01-06EmployedF61146...6Personal AutoPersonal L3Offer2Branch273.018929Four-Door CarMedsizeA1
10907TL39050Arizona23893.304100NoExtendedBachelor2011-02-06EmployedF39837...2Corporate AutoCorporate L3Offer1Web381.306996Luxury SUVMedsizeA2
10908WA60547California11971.977650NoPremiumCollege2011-02-13EmployedF64195...6Personal AutoPersonal L1Offer1Branch618.288849SUVMedsizeA2
10909IV32877California6857.519928NoBasicBachelor2011-01-08UnemployedM0...3Personal AutoPersonal L1Offer4Web1021.719397SUVMedsizeA1
\n", + "

10910 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " customer state customer_lifetime_value response coverage \\\n", + "0 DK49336 Arizona 4809.216960 No Basic \n", + "1 KX64629 California 2228.525238 No Basic \n", + "2 LZ68649 Washington 14947.917300 No Basic \n", + "3 XL78013 Oregon 22332.439460 Yes Extended \n", + "4 QA50777 Oregon 9025.067525 No Premium \n", + "... ... ... ... ... ... \n", + "10905 FE99816 Nevada 15563.369440 No Premium \n", + "10906 KX53892 Oregon 5259.444853 No Basic \n", + "10907 TL39050 Arizona 23893.304100 No Extended \n", + "10908 WA60547 California 11971.977650 No Premium \n", + "10909 IV32877 California 6857.519928 No Basic \n", + "\n", + " education effective_to_date employmentstatus gender income ... \\\n", + "0 College 2011-02-18 Employed M 48029 ... \n", + "1 College 2011-01-18 Unemployed F 0 ... \n", + "2 Bachelor 2011-02-10 Employed M 22139 ... \n", + "3 College 2011-01-11 Employed M 49078 ... \n", + "4 Bachelor 2011-01-17 Medical Leave F 23675 ... \n", + "... ... ... ... ... ... ... \n", + "10905 Bachelor 2011-01-19 Unemployed F 0 ... \n", + "10906 College 2011-01-06 Employed F 61146 ... \n", + "10907 Bachelor 2011-02-06 Employed F 39837 ... \n", + "10908 College 2011-02-13 Employed F 64195 ... \n", + "10909 Bachelor 2011-01-08 Unemployed M 0 ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "... ... ... ... ... 
\n", + "10905 7 Personal Auto Personal L1 Offer3 \n", + "10906 6 Personal Auto Personal L3 Offer2 \n", + "10907 2 Corporate Auto Corporate L3 Offer1 \n", + "10908 6 Personal Auto Personal L1 Offer1 \n", + "10909 3 Personal Auto Personal L1 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "... ... ... ... ... \n", + "10905 Web 1214.400000 Luxury Car Medsize \n", + "10906 Branch 273.018929 Four-Door Car Medsize \n", + "10907 Web 381.306996 Luxury SUV Medsize \n", + "10908 Branch 618.288849 SUV Medsize \n", + "10909 Web 1021.719397 SUV Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "... ... ... \n", + "10905 A 1 \n", + "10906 A 1 \n", + "10907 A 2 \n", + "10908 A 2 \n", + "10909 A 1 \n", + "\n", + "[10910 rows x 26 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your code goes here" + "# Your code goes here\n", + "# Step 1: Load the marketing customer analysis data\n", + "marketing_df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n", + "\n", + "# marketing_df.info()\n", + "# print()\n", + "# marketing_df.dtypes\n", + "# print()\n", + "# marketing_df.nunique()\n", + "# print()\n", + "# marketing_df.describe()\n", + "# print()\n", + "# display(marketing_df.isna().sum())\n", + "# print()\n", + "# display((marketing_df.isna().sum()/marketing_df.shape[0])*100)\n", + "print()\n", + "# df.head()\n", + "display(marketing_df.columns)\n", + "\n", + "# Step 2: Inspect and clean columns if needed\n", + "#marketing_df.columns = [c.strip().lower().replace(' ', '_') for c in marketing_df.columns]\n", +
"\n", + "# Step 2: Remove column 'unnamed:_0' that seems to be the same as the implicit index\n", + "marketing_df = marketing_df.drop(['unnamed:_0'], axis=1)\n", + "display(marketing_df)" ] }, { @@ -93,6 +1827,32 @@ "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." ] }, + { + "cell_type": "code", + "execution_count": 44, + "id": "fb7dc6e4-87ee-488a-96d8-e99f9cb4f745", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " customer_lifetime_value\n", + "sales_channel \n", + "Agent 33057887.85\n", + "Branch 24359201.21\n", + "Call Center 17364288.37\n", + "Web 12697632.90\n" + ] + } + ], + "source": [ + "# Step 3: Total revenue by sales channel\n", + "revenue_by_channel = marketing_df.pivot_table(index='sales_channel', values='customer_lifetime_value', aggfunc='sum').round(2)\n", + "\n", + "print(revenue_by_channel)" + ] + }, + { "cell_type": "markdown", "id": "640993b2-a291-436c-a34d-a551144f8196", @@ -103,6 +1863,30 @@ "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." 
] }, + { + "cell_type": "code", + "execution_count": 45, + "id": "f9f08943-d99a-4f17-b18c-f7df40e15dfe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "education Bachelor College Doctor High School or Below Master\n", + "gender \n", + "F 7874.27 7748.82 7328.51 8675.22 8157.05\n", + "M 7703.60 8052.46 7415.33 8149.69 8168.83\n" + ] + } + ], + "source": [ + "# Step 4: Average customer lifetime value by gender and education\n", + "clv_by_gender_education = marketing_df.pivot_table(index='gender', columns='education', values='customer_lifetime_value', aggfunc='mean').round(2)\n", + "\n", + "print(clv_by_gender_education)" + ] + }, { "cell_type": "markdown", "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", @@ -130,15 +1914,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto 1 443.434952\n", + "1 Corporate Auto 2 385.208135\n", + "2 Personal Auto 1 1727.605722\n", + "3 Personal Auto 2 1453.684441\n", + "4 Special Auto 1 87.074049\n", + "5 Special Auto 2 95.226817\n" + ] + } + ], "source": [ - "# Your code goes here" + "# Your code goes here\n", + "# Step 5: Extract month from date\n", + "marketing_df['month'] = pd.to_datetime(marketing_df['effective_to_date']).dt.month\n", + "\n", + "# Complaints by policy type and month, long format\n", + "complaints_summary = marketing_df.groupby(['policy_type', 'month'])['number_of_open_complaints'].sum().reset_index()\n", + "\n", + "print(complaints_summary)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d8d09e1-8f22-463c-9ac9-d3bd978cd884", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -160,7 +1973,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.6" } }, "nbformat": 4,