diff --git a/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb new file mode 100644 index 0000000..6eb7a60 --- /dev/null +++ b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb @@ -0,0 +1,1286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4d09544", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsTotal Claim AmountPolicy TypeVehicle Class
0GS98873ArizonaFBachelor323912.47%16061881/0/00633.600000Personal AutoFour-Door Car
1CW49887CaliforniaFMaster462680.11%794871141/0/00547.200000Special AutoSUV
2MY31220CaliforniaFCollege899704.02%542301121/0/00537.600000Personal AutoTwo-Door Car
3UH35128OregonFCollege2580706.30%712102141/1/001027.200000Personal AutoLuxury Car
4WH52799ArizonaFCollege380812.21%94903941/0/00451.200000Corporate AutoTwo-Door Car
....................................
991HV85198ArizonaMMaster847141.75%63513701/0/00185.667213Personal AutoFour-Door Car
992BS91566ArizonaFCollege543121.91%58161681/0/00140.747286Corporate AutoFour-Door Car
993IL40123NevadaFCollege568964.41%83640701/0/00471.050488Corporate AutoTwo-Door Car
994MY32149CaliforniaFMaster368672.38%0961/0/0028.460568Personal AutoTwo-Door Car
995SA91515CaliforniaMBachelor399258.39%01111/0/00700.349052Personal AutoSUV
\n", + "

996 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "0 GS98873 Arizona F Bachelor 323912.47% 16061 \n", + "1 CW49887 California F Master 462680.11% 79487 \n", + "2 MY31220 California F College 899704.02% 54230 \n", + "3 UH35128 Oregon F College 2580706.30% 71210 \n", + "4 WH52799 Arizona F College 380812.21% 94903 \n", + ".. ... ... ... ... ... ... \n", + "991 HV85198 Arizona M Master 847141.75% 63513 \n", + "992 BS91566 Arizona F College 543121.91% 58161 \n", + "993 IL40123 Nevada F College 568964.41% 83640 \n", + "994 MY32149 California F Master 368672.38% 0 \n", + "995 SA91515 California M Bachelor 399258.39% 0 \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Total Claim Amount \\\n", + "0 88 1/0/00 633.600000 \n", + "1 114 1/0/00 547.200000 \n", + "2 112 1/0/00 537.600000 \n", + "3 214 1/1/00 1027.200000 \n", + "4 94 1/0/00 451.200000 \n", + ".. ... ... ... \n", + "991 70 1/0/00 185.667213 \n", + "992 68 1/0/00 140.747286 \n", + "993 70 1/0/00 471.050488 \n", + "994 96 1/0/00 28.460568 \n", + "995 111 1/0/00 700.349052 \n", + "\n", + " Policy Type Vehicle Class \n", + "0 Personal Auto Four-Door Car \n", + "1 Special Auto SUV \n", + "2 Personal Auto Two-Door Car \n", + "3 Personal Auto Luxury Car \n", + "4 Corporate Auto Two-Door Car \n", + ".. ... ... \n", + "991 Personal Auto Four-Door Car \n", + "992 Corporate Auto Four-Door Car \n", + "993 Corporate Auto Two-Door Car \n", + "994 Personal Auto Two-Door Car \n", + "995 Personal Auto SUV \n", + "\n", + "[996 rows x 11 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b48e9b60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerStateCustomer Lifetime ValueEducationGenderIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeTotal Claim AmountVehicle Class
0SA25987Washington3479.137523High School or BelowM01040Personal Auto499.200000Two-Door Car
1TB86706Arizona2502.637401MasterM0660Personal Auto3.468912Two-Door Car
2ZL73902Nevada3265.156348BachelorF25820820Personal Auto393.600000Four-Door Car
3KX23516California4455.843406High School or BelowF01210Personal Auto699.615192SUV
4FN77294California7704.958480High School or BelowM303661012Personal Auto484.800000SUV
....................................
7065LA72316California23405.987980BachelorM71941730Personal Auto198.234764Four-Door Car
7066PK87824California3096.511217CollegeF21604790Corporate Auto379.200000Four-Door Car
7067TD14365California8163.890428BachelorM0853Corporate Auto790.784983Four-Door Car
7068UP19263California7524.442436CollegeM21941960Personal Auto691.200000Four-Door Car
7069Y167826California2611.836866CollegeM0770Corporate Auto369.600000Two-Door Car
\n", + "

7070 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer State Customer Lifetime Value Education \\\n", + "0 SA25987 Washington 3479.137523 High School or Below \n", + "1 TB86706 Arizona 2502.637401 Master \n", + "2 ZL73902 Nevada 3265.156348 Bachelor \n", + "3 KX23516 California 4455.843406 High School or Below \n", + "4 FN77294 California 7704.958480 High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 California 23405.987980 Bachelor \n", + "7066 PK87824 California 3096.511217 College \n", + "7067 TD14365 California 8163.890428 Bachelor \n", + "7068 UP19263 California 7524.442436 College \n", + "7069 Y167826 California 2611.836866 College \n", + "\n", + " Gender Income Monthly Premium Auto Number of Open Complaints \\\n", + "0 M 0 104 0 \n", + "1 M 0 66 0 \n", + "2 F 25820 82 0 \n", + "3 F 0 121 0 \n", + "4 M 30366 101 2 \n", + "... ... ... ... ... \n", + "7065 M 71941 73 0 \n", + "7066 F 21604 79 0 \n", + "7067 M 0 85 3 \n", + "7068 M 21941 96 0 \n", + "7069 M 0 77 0 \n", + "\n", + " Policy Type Total Claim Amount Vehicle Class \n", + "0 Personal Auto 499.200000 Two-Door Car \n", + "1 Personal Auto 3.468912 Two-Door Car \n", + "2 Personal Auto 393.600000 Four-Door Car \n", + "3 Personal Auto 699.615192 SUV \n", + "4 Personal Auto 484.800000 SUV \n", + "... ... ... ... \n", + "7065 Personal Auto 198.234764 Four-Door Car \n", + "7066 Corporate Auto 379.200000 Four-Door Car \n", + "7067 Corporate Auto 790.784983 Four-Door Car \n", + "7068 Personal Auto 691.200000 Four-Door Car \n", + "7069 Corporate Auto 369.600000 Two-Door Car \n", + "\n", + "[7070 rows x 11 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")\n", + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "35090aa7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a795af02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value',\n", + " 'Income', 'Monthly Premium Auto', 'Number of Open Complaints',\n", + " 'Policy Type', 'Vehicle Class', 'Total Claim Amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0c4059ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'st', 'gender', 'education', 'customer lifetime value',\n", + " 'income', 'monthly premium auto', 'number of open complaints',\n", + " 'policy type', 'vehicle class', 'total claim amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.columns.str.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43746587", + "metadata": {}, + "outputs": [], + "source": [ + "df1.rename(columns={\"old_column_name1\": \"new_column_name1\", \"old_column_name2\": \"new_column_name2\",...}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afd9cbe0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cb5ce7a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "488ffb2e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0517cc02", + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [], + "source": [ + "# Your code goes here" + ] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).\n", + "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. 
Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." + ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a069e0b-b400-470e-904d-d17582191be4", + "metadata": { + "id": "3a069e0b-b400-470e-904d-d17582191be4" + }, + "outputs": [], + "source": [ + "# Your code goes here" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lab-dw-data-structuring-and-combining.ipynb 
b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..9c9112d 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,16 +36,3208 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, "outputs": [], "source": [ - "# Your code goes here" + "import pandas as pd\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "#df1" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "94b17acc", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n", + "#df2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "86a0178e", + "metadata": {}, + "outputs": [], + "source": [ + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")\n", + "#df3" + ] + }, + { + "cell_type": "markdown", + "id": "d4abcb8f", + "metadata": {}, + "source": [ + "# Cleaning df1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "38a86441", + "metadata": {}, + "outputs": [], + "source": [ + "df1.columns = df1.columns.str.lower()\n", + "df1.columns = df1.columns.str.replace(\" \", \"_\")\n", + "df1.rename(columns={\"st\":\"state\"}, inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ff7995dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 2937\n", + "gender 3054\n", + "education 2937\n", + "customer_lifetime_value 2940\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 2937\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "df1[\"customer_lifetime_value\"] = df1[\"customer_lifetime_value\"].str.strip(\"%\")\n", + "df1[\"customer_lifetime_value\"] = pd.to_numeric(df1[\"customer_lifetime_value\"], errors=\"coerce\")\n", + "df1.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "17a4ff8c", + "metadata": {}, + "outputs": [], + "source": [ + "df1[\"number_of_open_complaints\"] = df1[\"number_of_open_complaints\"].str.split(\"/\", expand=True)[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fb710d08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 0\n", + "state 0\n", + "gender 117\n", + "education 0\n", + "customer_lifetime_value 3\n", + "income 0\n", + "monthly_premium_auto 0\n", + "number_of_open_complaints 0\n", + "policy_type 0\n", + "vehicle_class 0\n", + "total_claim_amount 0\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df1.dropna(how=\"all\")\n", + "df1.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0315ffe2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 0\n", + "state 0\n", + "gender 0\n", + "education 0\n", + "customer_lifetime_value 0\n", + "income 0\n", + "monthly_premium_auto 0\n", + "number_of_open_complaints 0\n", + "policy_type 0\n", + "vehicle_class 0\n", + "total_claim_amount 0\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1[\"gender\"] = df1[\"gender\"].fillna(\"unknown\")\n", + "df1[\"customer_lifetime_value\"] = df1[\"customer_lifetime_value\"].fillna(0)\n", + "df1.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6773cc52", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonunknownMaster0.000.01000.00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.590.094.00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.1748767.0108.00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.180.0106.00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.6536357.068.00Personal AutoFour-Door Car17.269323
....................................
1066TM65736OregonMMaster305955.0338644.078.01Personal AutoFour-Door Car361.455219
1067VJ51327CaliFHigh School or Below2031499.7663209.0102.02Personal AutoSUV207.320041
1068GS98873ArizonaFBachelor323912.4716061.088.00Personal AutoFour-Door Car633.600000
1069CW49887CaliforniaFMaster462680.1179487.0114.00Special AutoSUV547.200000
1070MY31220CaliforniaFCollege899704.0254230.0112.00Personal AutoTwo-Door Car537.600000
\n", + "

1071 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 Washington unknown Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "1066 TM65736 Oregon M Master \n", + "1067 VJ51327 Cali F High School or Below \n", + "1068 GS98873 Arizona F Bachelor \n", + "1069 CW49887 California F Master \n", + "1070 MY31220 California F College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 0.00 0.0 1000.0 \n", + "1 697953.59 0.0 94.0 \n", + "2 1288743.17 48767.0 108.0 \n", + "3 764586.18 0.0 106.0 \n", + "4 536307.65 36357.0 68.0 \n", + "... ... ... ... \n", + "1066 305955.03 38644.0 78.0 \n", + "1067 2031499.76 63209.0 102.0 \n", + "1068 323912.47 16061.0 88.0 \n", + "1069 462680.11 79487.0 114.0 \n", + "1070 899704.02 54230.0 112.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 0 Personal Auto Four-Door Car \n", + "1 0 Personal Auto Four-Door Car \n", + "2 0 Personal Auto Two-Door Car \n", + "3 0 Corporate Auto SUV \n", + "4 0 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "1066 1 Personal Auto Four-Door Car \n", + "1067 2 Personal Auto SUV \n", + "1068 0 Personal Auto Four-Door Car \n", + "1069 0 Special Auto SUV \n", + "1070 0 Personal Auto Two-Door Car \n", + "\n", + " total_claim_amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "1066 361.455219 \n", + "1067 207.320041 \n", + "1068 633.600000 \n", + "1069 547.200000 \n", + "1070 537.600000 \n", + "\n", + "[1071 rows x 11 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "94604f47", + "metadata": {}, + "outputs": [], + "source": [ + "text_columns = [\"gender\", \"education\", \"policy_type\", \"vehicle_class\", \"state\"]\n", + "for column in text_columns:\n", + " df1[column] = df1[column].str.lower().str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "49420406", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392washingtonunknownmaster0.000.01000.00personal autofour-door car2.704934
1QZ44356arizonafbachelor697953.590.094.00personal autofour-door car1131.464935
2AI49188nevadafbachelor1288743.1748767.0108.00personal autotwo-door car566.472247
3WW63253californiambachelor764586.180.0106.00corporate autosuv529.881344
4GA49547washingtonmhigh school or below536307.6536357.068.00personal autofour-door car17.269323
....................................
1066TM65736oregonmmaster305955.0338644.078.01personal autofour-door car361.455219
1067VJ51327califhigh school or below2031499.7663209.0102.02personal autosuv207.320041
1068GS98873arizonafbachelor323912.4716061.088.00personal autofour-door car633.600000
1069CW49887californiafmaster462680.1179487.0114.00special autosuv547.200000
1070MY31220californiafcollege899704.0254230.0112.00personal autotwo-door car537.600000
\n", + "

1071 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 washington unknown master \n", + "1 QZ44356 arizona f bachelor \n", + "2 AI49188 nevada f bachelor \n", + "3 WW63253 california m bachelor \n", + "4 GA49547 washington m high school or below \n", + "... ... ... ... ... \n", + "1066 TM65736 oregon m master \n", + "1067 VJ51327 cali f high school or below \n", + "1068 GS98873 arizona f bachelor \n", + "1069 CW49887 california f master \n", + "1070 MY31220 california f college \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 0.00 0.0 1000.0 \n", + "1 697953.59 0.0 94.0 \n", + "2 1288743.17 48767.0 108.0 \n", + "3 764586.18 0.0 106.0 \n", + "4 536307.65 36357.0 68.0 \n", + "... ... ... ... \n", + "1066 305955.03 38644.0 78.0 \n", + "1067 2031499.76 63209.0 102.0 \n", + "1068 323912.47 16061.0 88.0 \n", + "1069 462680.11 79487.0 114.0 \n", + "1070 899704.02 54230.0 112.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 0 personal auto four-door car \n", + "1 0 personal auto four-door car \n", + "2 0 personal auto two-door car \n", + "3 0 corporate auto suv \n", + "4 0 personal auto four-door car \n", + "... ... ... ... \n", + "1066 1 personal auto four-door car \n", + "1067 2 personal auto suv \n", + "1068 0 personal auto four-door car \n", + "1069 0 special auto suv \n", + "1070 0 personal auto two-door car \n", + "\n", + " total_claim_amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... 
\n", + "1066 361.455219 \n", + "1067 207.320041 \n", + "1068 633.600000 \n", + "1069 547.200000 \n", + "1070 537.600000 \n", + "\n", + "[1071 rows x 11 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "688c5dbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['unknown', 'female', 'male'], dtype=object)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1[\"gender\"] = df1[\"gender\"].replace({\"m\": \"male\", \"f\": \"female\", \"femal\": \"female\"})\n", + "df1[\"gender\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e8d2c1e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['four-door car', 'two-door car', 'suv', 'luxury'], dtype=object)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1[\"vehicle_class\"] = df1[\"vehicle_class\"].replace({\"luxury suv\": \"luxury\", \"luxury car\": \"luxury\", \"sports car\": \"luxury\", \"luxery\": \"luxury\" })\n", + "df1[\"vehicle_class\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e9c343c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['washington', 'arizona', 'nevada', 'california', 'oregon'],\n", + " dtype=object)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1[\"state\"] = df1[\"state\"].replace({\"cali\": \"california\", \"az\": \"arizona\", \"wa\": \"washington\"})\n", + "df1[\"state\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "bae40876", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['master', 'bachelor', 'high school or below', 'college', 
'doctor'],\n", + " dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#df1[\"education\"].unique()\n", + "df1[\"education\"] = df1[\"education\"].replace({\"bachelors\": \"bachelor\"})\n", + "df1[\"education\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3ce9d3ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['personal auto', 'corporate auto', 'special auto'], dtype=object)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1[\"policy_type\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "8f1cc493", + "metadata": {}, + "outputs": [], + "source": [ + "df1[\"income\"] = df1[\"income\"].astype(int)\n", + "df1[\"monthly_premium_auto\"] = df1[\"monthly_premium_auto\"].astype(int)\n", + "df1[\"number_of_open_complaints\"] = df1[\"number_of_open_complaints\"].astype(int)\n", + "df1[\"customer_lifetime_value\"] = df1[\"customer_lifetime_value\"].round(2)\n", + "df1[\"total_claim_amount\"] = df1[\"total_claim_amount\"].round(2)" ] }, + { + "cell_type": "markdown", + "id": "9d7e96f8", + "metadata": {}, + "source": [ + "# Cleaning df2" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7d38baae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsTotal Claim AmountPolicy TypeVehicle Class
0GS98873ArizonaFBachelor323912.47%16061881/0/00633.600000Personal AutoFour-Door Car
1CW49887CaliforniaFMaster462680.11%794871141/0/00547.200000Special AutoSUV
2MY31220CaliforniaFCollege899704.02%542301121/0/00537.600000Personal AutoTwo-Door Car
3UH35128OregonFCollege2580706.30%712102141/1/001027.200000Personal AutoLuxury Car
4WH52799ArizonaFCollege380812.21%94903941/0/00451.200000Corporate AutoTwo-Door Car
....................................
991HV85198ArizonaMMaster847141.75%63513701/0/00185.667213Personal AutoFour-Door Car
992BS91566ArizonaFCollege543121.91%58161681/0/00140.747286Corporate AutoFour-Door Car
993IL40123NevadaFCollege568964.41%83640701/0/00471.050488Corporate AutoTwo-Door Car
994MY32149CaliforniaFMaster368672.38%0961/0/0028.460568Personal AutoTwo-Door Car
995SA91515CaliforniaMBachelor399258.39%01111/0/00700.349052Personal AutoSUV
\n", + "

996 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "0 GS98873 Arizona F Bachelor 323912.47% 16061 \n", + "1 CW49887 California F Master 462680.11% 79487 \n", + "2 MY31220 California F College 899704.02% 54230 \n", + "3 UH35128 Oregon F College 2580706.30% 71210 \n", + "4 WH52799 Arizona F College 380812.21% 94903 \n", + ".. ... ... ... ... ... ... \n", + "991 HV85198 Arizona M Master 847141.75% 63513 \n", + "992 BS91566 Arizona F College 543121.91% 58161 \n", + "993 IL40123 Nevada F College 568964.41% 83640 \n", + "994 MY32149 California F Master 368672.38% 0 \n", + "995 SA91515 California M Bachelor 399258.39% 0 \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Total Claim Amount \\\n", + "0 88 1/0/00 633.600000 \n", + "1 114 1/0/00 547.200000 \n", + "2 112 1/0/00 537.600000 \n", + "3 214 1/1/00 1027.200000 \n", + "4 94 1/0/00 451.200000 \n", + ".. ... ... ... \n", + "991 70 1/0/00 185.667213 \n", + "992 68 1/0/00 140.747286 \n", + "993 70 1/0/00 471.050488 \n", + "994 96 1/0/00 28.460568 \n", + "995 111 1/0/00 700.349052 \n", + "\n", + " Policy Type Vehicle Class \n", + "0 Personal Auto Four-Door Car \n", + "1 Special Auto SUV \n", + "2 Personal Auto Two-Door Car \n", + "3 Personal Auto Luxury Car \n", + "4 Corporate Auto Two-Door Car \n", + ".. ... ... 
\n", + "991 Personal Auto Four-Door Car \n", + "992 Corporate Auto Four-Door Car \n", + "993 Corporate Auto Two-Door Car \n", + "994 Personal Auto Two-Door Car \n", + "995 Personal Auto SUV \n", + "\n", + "[996 rows x 11 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "096441e7", + "metadata": {}, + "outputs": [], + "source": [ + "df2.columns = df2.columns.str.lower()\n", + "df2.columns = df2.columns.str.replace(\" \", \"_\")\n", + "df2.rename(columns={\"st\":\"state\"}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2a6d8955", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 0\n", + "state 0\n", + "gender 5\n", + "education 0\n", + "customer_lifetime_value 0\n", + "income 0\n", + "monthly_premium_auto 0\n", + "number_of_open_complaints 0\n", + "total_claim_amount 0\n", + "policy_type 0\n", + "vehicle_class 0\n", + "dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Drop the trailing percent sign, then convert df2's own column to numeric (the source must be df2, not df1).\n", + "df2[\"customer_lifetime_value\"] = df2[\"customer_lifetime_value\"].str.strip(\"%\")\n", + "df2[\"customer_lifetime_value\"] = pd.to_numeric(df2[\"customer_lifetime_value\"], errors=\"coerce\")\n", + "df2.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "0ddeb913", + "metadata": {}, + "outputs": [], + "source": [ + "df2[\"gender\"] = df2[\"gender\"].fillna(\"unknown\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5167588d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintstotal_claim_amountpolicy_typevehicle_class
0GS98873ArizonaFBachelor0.0016061881/0/00633.600000Personal AutoFour-Door Car
1CW49887CaliforniaFMaster697953.59794871141/0/00547.200000Special AutoSUV
2MY31220CaliforniaFCollege1288743.17542301121/0/00537.600000Personal AutoTwo-Door Car
3UH35128OregonFCollege764586.18712102141/1/001027.200000Personal AutoLuxury Car
4WH52799ArizonaFCollege536307.6594903941/0/00451.200000Corporate AutoTwo-Door Car
....................................
991HV85198ArizonaMMaster2153133.2863513701/0/00185.667213Personal AutoFour-Door Car
992BS91566ArizonaFCollege1262283.2758161681/0/00140.747286Corporate AutoFour-Door Car
993IL40123NevadaFCollege2017196.1583640701/0/00471.050488Corporate AutoTwo-Door Car
994MY32149CaliforniaFMaster1646436.590961/0/0028.460568Personal AutoTwo-Door Car
995SA91515CaliforniaMBachelor559538.9901111/0/00700.349052Personal AutoSUV
\n", + "

996 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value income \\\n", + "0 GS98873 Arizona F Bachelor 0.00 16061 \n", + "1 CW49887 California F Master 697953.59 79487 \n", + "2 MY31220 California F College 1288743.17 54230 \n", + "3 UH35128 Oregon F College 764586.18 71210 \n", + "4 WH52799 Arizona F College 536307.65 94903 \n", + ".. ... ... ... ... ... ... \n", + "991 HV85198 Arizona M Master 2153133.28 63513 \n", + "992 BS91566 Arizona F College 1262283.27 58161 \n", + "993 IL40123 Nevada F College 2017196.15 83640 \n", + "994 MY32149 California F Master 1646436.59 0 \n", + "995 SA91515 California M Bachelor 559538.99 0 \n", + "\n", + " monthly_premium_auto number_of_open_complaints total_claim_amount \\\n", + "0 88 1/0/00 633.600000 \n", + "1 114 1/0/00 547.200000 \n", + "2 112 1/0/00 537.600000 \n", + "3 214 1/1/00 1027.200000 \n", + "4 94 1/0/00 451.200000 \n", + ".. ... ... ... \n", + "991 70 1/0/00 185.667213 \n", + "992 68 1/0/00 140.747286 \n", + "993 70 1/0/00 471.050488 \n", + "994 96 1/0/00 28.460568 \n", + "995 111 1/0/00 700.349052 \n", + "\n", + " policy_type vehicle_class \n", + "0 Personal Auto Four-Door Car \n", + "1 Special Auto SUV \n", + "2 Personal Auto Two-Door Car \n", + "3 Personal Auto Luxury Car \n", + "4 Corporate Auto Two-Door Car \n", + ".. ... ... 
\n", + "991 Personal Auto Four-Door Car \n", + "992 Corporate Auto Four-Door Car \n", + "993 Corporate Auto Two-Door Car \n", + "994 Personal Auto Two-Door Car \n", + "995 Personal Auto SUV \n", + "\n", + "[996 rows x 11 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "daa64953", + "metadata": {}, + "outputs": [], + "source": [ + "text_columns = [\"gender\", \"education\", \"policy_type\", \"vehicle_class\", \"state\"]\n", + "for column in text_columns:\n", + "    # Normalize df2's own values (the source must be df2, not df1).\n", + "    df2[column] = df2[column].str.lower().str.strip()\n", + "\n", + "# Apply the same label mappings used for df1 so both frames share one vocabulary when combined.\n", + "df2[\"gender\"] = df2[\"gender\"].replace({\"m\": \"male\", \"f\": \"female\", \"femal\": \"female\"})\n", + "df2[\"vehicle_class\"] = df2[\"vehicle_class\"].replace({\"luxury suv\": \"luxury\", \"luxury car\": \"luxury\", \"sports car\": \"luxury\", \"luxery\": \"luxury\"})\n", + "df2[\"state\"] = df2[\"state\"].replace({\"cali\": \"california\", \"az\": \"arizona\", \"wa\": \"washington\"})\n", + "df2[\"education\"] = df2[\"education\"].replace({\"bachelors\": \"bachelor\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c3394711", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['washington', 'arizona', 'nevada', 'california', 'oregon'],\n", + " dtype=object)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"state\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "3198eaaa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['unknown', 'female', 'male'], dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"gender\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "301f87ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['master', 'bachelor', 'high school or below', 'college', 'doctor'],\n", + " dtype=object)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"education\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "26e27e09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['personal auto', 'corporate auto', 'special auto'], dtype=object)"
+ ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"policy_type\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "88857119", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['four-door car', 'two-door car', 'suv', 'luxury'], dtype=object)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"vehicle_class\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "30f064b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['1/0/00', '1/1/00', '1/3/00', '1/5/00', '1/2/00', '1/4/00'],\n", + " dtype=object)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"number_of_open_complaints\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "845e7522", + "metadata": {}, + "outputs": [], + "source": [ + "df2[\"number_of_open_complaints\"] = df2[\"number_of_open_complaints\"].str.split(\"/\", expand=True)[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "7af46c3d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['0', '1', '3', '5', '2', '4'], dtype=object)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2[\"number_of_open_complaints\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "8c40edf7", + "metadata": {}, + "outputs": [], + "source": [ + "df2[\"number_of_open_complaints\"] = df2[\"number_of_open_complaints\"].astype(int)\n", + "df2[\"customer_lifetime_value\"] = df2[\"customer_lifetime_value\"].round(2)\n", + "df2[\"total_claim_amount\"] = df2[\"total_claim_amount\"].round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e2174417", + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintstotal_claim_amountpolicy_typevehicle_class
0GS98873washingtonunknownmaster0.0016061880633.60personal autofour-door car
1CW49887arizonafemalebachelor697953.59794871140547.20personal autofour-door car
2MY31220nevadafemalebachelor1288743.17542301120537.60personal autotwo-door car
3UH35128californiamalebachelor764586.187121021411027.20corporate autosuv
4WH52799washingtonmalehigh school or below536307.6594903940451.20personal autofour-door car
....................................
991HV85198oregonfemalehigh school or below2153133.2863513700185.67personal autofour-door car
992BS91566arizonamalebachelor1262283.2758161680140.75personal autoluxury
993IL40123arizonamaledoctor2017196.1583640700471.05personal autotwo-door car
994MY32149californiafemalebachelor1646436.59096028.46personal autosuv
995SA91515oregonfemalehigh school or below559538.9901110700.35personal autofour-door car
\n", + "

996 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 GS98873 washington unknown master \n", + "1 CW49887 arizona female bachelor \n", + "2 MY31220 nevada female bachelor \n", + "3 UH35128 california male bachelor \n", + "4 WH52799 washington male high school or below \n", + ".. ... ... ... ... \n", + "991 HV85198 oregon female high school or below \n", + "992 BS91566 arizona male bachelor \n", + "993 IL40123 arizona male doctor \n", + "994 MY32149 california female bachelor \n", + "995 SA91515 oregon female high school or below \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 0.00 16061 88 \n", + "1 697953.59 79487 114 \n", + "2 1288743.17 54230 112 \n", + "3 764586.18 71210 214 \n", + "4 536307.65 94903 94 \n", + ".. ... ... ... \n", + "991 2153133.28 63513 70 \n", + "992 1262283.27 58161 68 \n", + "993 2017196.15 83640 70 \n", + "994 1646436.59 0 96 \n", + "995 559538.99 0 111 \n", + "\n", + " number_of_open_complaints total_claim_amount policy_type \\\n", + "0 0 633.60 personal auto \n", + "1 0 547.20 personal auto \n", + "2 0 537.60 personal auto \n", + "3 1 1027.20 corporate auto \n", + "4 0 451.20 personal auto \n", + ".. ... ... ... \n", + "991 0 185.67 personal auto \n", + "992 0 140.75 personal auto \n", + "993 0 471.05 personal auto \n", + "994 0 28.46 personal auto \n", + "995 0 700.35 personal auto \n", + "\n", + " vehicle_class \n", + "0 four-door car \n", + "1 four-door car \n", + "2 two-door car \n", + "3 suv \n", + "4 four-door car \n", + ".. ... 
\n", + "991 four-door car \n", + "992 luxury \n", + "993 two-door car \n", + "994 suv \n", + "995 four-door car \n", + "\n", + "[996 rows x 11 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "54b16d91", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 0\n", + "state 0\n", + "gender 0\n", + "education 0\n", + "customer_lifetime_value 0\n", + "income 0\n", + "monthly_premium_auto 0\n", + "number_of_open_complaints 0\n", + "total_claim_amount 0\n", + "policy_type 0\n", + "vehicle_class 0\n", + "dtype: int64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "1a7fab3b", + "metadata": {}, + "source": [ + "# Cleaning df3" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "377f69cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerStateCustomer Lifetime ValueEducationGenderIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeTotal Claim AmountVehicle Class
0SA25987Washington3479.137523High School or BelowM01040Personal Auto499.200000Two-Door Car
1TB86706Arizona2502.637401MasterM0660Personal Auto3.468912Two-Door Car
2ZL73902Nevada3265.156348BachelorF25820820Personal Auto393.600000Four-Door Car
3KX23516California4455.843406High School or BelowF01210Personal Auto699.615192SUV
4FN77294California7704.958480High School or BelowM303661012Personal Auto484.800000SUV
....................................
7065LA72316California23405.987980BachelorM71941730Personal Auto198.234764Four-Door Car
7066PK87824California3096.511217CollegeF21604790Corporate Auto379.200000Four-Door Car
7067TD14365California8163.890428BachelorM0853Corporate Auto790.784983Four-Door Car
7068UP19263California7524.442436CollegeM21941960Personal Auto691.200000Four-Door Car
7069Y167826California2611.836866CollegeM0770Corporate Auto369.600000Two-Door Car
\n", + "

7070 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer State Customer Lifetime Value Education \\\n", + "0 SA25987 Washington 3479.137523 High School or Below \n", + "1 TB86706 Arizona 2502.637401 Master \n", + "2 ZL73902 Nevada 3265.156348 Bachelor \n", + "3 KX23516 California 4455.843406 High School or Below \n", + "4 FN77294 California 7704.958480 High School or Below \n", + "... ... ... ... ... \n", + "7065 LA72316 California 23405.987980 Bachelor \n", + "7066 PK87824 California 3096.511217 College \n", + "7067 TD14365 California 8163.890428 Bachelor \n", + "7068 UP19263 California 7524.442436 College \n", + "7069 Y167826 California 2611.836866 College \n", + "\n", + " Gender Income Monthly Premium Auto Number of Open Complaints \\\n", + "0 M 0 104 0 \n", + "1 M 0 66 0 \n", + "2 F 25820 82 0 \n", + "3 F 0 121 0 \n", + "4 M 30366 101 2 \n", + "... ... ... ... ... \n", + "7065 M 71941 73 0 \n", + "7066 F 21604 79 0 \n", + "7067 M 0 85 3 \n", + "7068 M 21941 96 0 \n", + "7069 M 0 77 0 \n", + "\n", + " Policy Type Total Claim Amount Vehicle Class \n", + "0 Personal Auto 499.200000 Two-Door Car \n", + "1 Personal Auto 3.468912 Two-Door Car \n", + "2 Personal Auto 393.600000 Four-Door Car \n", + "3 Personal Auto 699.615192 SUV \n", + "4 Personal Auto 484.800000 SUV \n", + "... ... ... ... 
\n", + "7065 Personal Auto 198.234764 Four-Door Car \n", + "7066 Corporate Auto 379.200000 Four-Door Car \n", + "7067 Corporate Auto 790.784983 Four-Door Car \n", + "7068 Personal Auto 691.200000 Four-Door Car \n", + "7069 Corporate Auto 369.600000 Two-Door Car \n", + "\n", + "[7070 rows x 11 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "17283368", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Customer', 'State', 'Customer Lifetime Value', 'Education', 'Gender',\n", + " 'Income', 'Monthly Premium Auto', 'Number of Open Complaints',\n", + " 'Policy Type', 'Total Claim Amount', 'Vehicle Class'],\n", + " dtype='object')" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c964a69b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'customer_lifetime_value', 'education', 'gender',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'total_claim_amount', 'vehicle_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns = (df3.columns.str.strip().str.lower().str.replace(\" \", \"_\"))\n", + "df3 = df3.rename(columns={\"st\": \"state\"})\n", + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "eefd8d37", + "metadata": {}, + "outputs": [], + "source": [ + "text_columns = [\"gender\", \"education\", \"policy_type\", \"vehicle_class\", \"state\"]\n", + "for column in text_columns:\n", + " df3[column] = df3[column].str.lower().str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "d1005163", + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstatecustomer_lifetime_valueeducationgenderincomemonthly_premium_autonumber_of_open_complaintspolicy_typetotal_claim_amountvehicle_class
0SA25987washington3479.137523high school or belowm01040personal auto499.200000two-door car
1TB86706arizona2502.637401masterm0660personal auto3.468912two-door car
2ZL73902nevada3265.156348bachelorf25820820personal auto393.600000four-door car
3KX23516california4455.843406high school or belowf01210personal auto699.615192suv
4FN77294california7704.958480high school or belowm303661012personal auto484.800000suv
....................................
7065LA72316california23405.987980bachelorm71941730personal auto198.234764four-door car
7066PK87824california3096.511217collegef21604790corporate auto379.200000four-door car
7067TD14365california8163.890428bachelorm0853corporate auto790.784983four-door car
7068UP19263california7524.442436collegem21941960personal auto691.200000four-door car
7069Y167826california2611.836866collegem0770corporate auto369.600000two-door car
\n", + "

7070 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state customer_lifetime_value education \\\n", + "0 SA25987 washington 3479.137523 high school or below \n", + "1 TB86706 arizona 2502.637401 master \n", + "2 ZL73902 nevada 3265.156348 bachelor \n", + "3 KX23516 california 4455.843406 high school or below \n", + "4 FN77294 california 7704.958480 high school or below \n", + "... ... ... ... ... \n", + "7065 LA72316 california 23405.987980 bachelor \n", + "7066 PK87824 california 3096.511217 college \n", + "7067 TD14365 california 8163.890428 bachelor \n", + "7068 UP19263 california 7524.442436 college \n", + "7069 Y167826 california 2611.836866 college \n", + "\n", + " gender income monthly_premium_auto number_of_open_complaints \\\n", + "0 m 0 104 0 \n", + "1 m 0 66 0 \n", + "2 f 25820 82 0 \n", + "3 f 0 121 0 \n", + "4 m 30366 101 2 \n", + "... ... ... ... ... \n", + "7065 m 71941 73 0 \n", + "7066 f 21604 79 0 \n", + "7067 m 0 85 3 \n", + "7068 m 21941 96 0 \n", + "7069 m 0 77 0 \n", + "\n", + " policy_type total_claim_amount vehicle_class \n", + "0 personal auto 499.200000 two-door car \n", + "1 personal auto 3.468912 two-door car \n", + "2 personal auto 393.600000 four-door car \n", + "3 personal auto 699.615192 suv \n", + "4 personal auto 484.800000 suv \n", + "... ... ... ... 
\n", + "7065 personal auto 198.234764 four-door car \n", + "7066 corporate auto 379.200000 four-door car \n", + "7067 corporate auto 790.784983 four-door car \n", + "7068 personal auto 691.200000 four-door car \n", + "7069 corporate auto 369.600000 two-door car \n", + "\n", + "[7070 rows x 11 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "48018657", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([3479.137523, 2502.637401, 3265.156348, ..., 8163.890428,\n", + " 7524.442436, 2611.836866])" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3[\"vehicle_class\"] = df3[\"vehicle_class\"].replace({\"luxury suv\": \"luxury\", \"luxury car\": \"luxury\", \"sports car\": \"luxury\", \"luxery\": \"luxury\" })\n", + "df3[\"customer_lifetime_value\"].unique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "17ba33fe", + "metadata": {}, + "outputs": [], + "source": [ + "df3[\"number_of_open_complaints\"] = df3[\"number_of_open_complaints\"].astype(int)\n", + "df3[\"customer_lifetime_value\"] = df3[\"customer_lifetime_value\"].round(2)\n", + "df3[\"total_claim_amount\"] = df3[\"total_claim_amount\"].round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "6e6b8ca0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 0\n", + "state 0\n", + "customer_lifetime_value 0\n", + "education 0\n", + "gender 0\n", + "income 0\n", + "monthly_premium_auto 0\n", + "number_of_open_complaints 0\n", + "policy_type 0\n", + "total_claim_amount 0\n", + "vehicle_class 0\n", + "dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "95d6e7fa", + 
"metadata": {}, + "source": [ + "## Last checks before merging" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "41b9ddbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'vehicle_class', 'total_claim_amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "8d871ec0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'total_claim_amount', 'policy_type', 'vehicle_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "217fcbfa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'customer_lifetime_value', 'education', 'gender',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'total_claim_amount', 'vehicle_class'],\n", + " dtype='object')" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "af4e7429", + "metadata": {}, + "outputs": [], + "source": [ + "cols = df1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "2a68032b", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df2[cols]\n", + "df3 = df3[cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": 
"c4afa62d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'vehicle_class', 'total_claim_amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "884587cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'vehicle_class', 'total_claim_amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "1473ee10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'vehicle_class', 'total_claim_amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "c09a78a3", + "metadata": {}, + "outputs": [], + "source": [ + "comb_df = pd.concat([df1, df2, df3], axis=0).reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "6f49d6b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392washingtonunknownmaster0.00010000personal autofour-door car2.70
1QZ44356arizonafemalebachelor697953.590940personal autofour-door car1131.46
2AI49188nevadafemalebachelor1288743.17487671080personal autotwo-door car566.47
3WW63253californiamalebachelor764586.1801060corporate autosuv529.88
4GA49547washingtonmalehigh school or below536307.6536357680personal autofour-door car17.27
....................................
9132LA72316californiambachelor23405.9971941730personal autofour-door car198.23
9133PK87824californiafcollege3096.5121604790corporate autofour-door car379.20
9134TD14365californiambachelor8163.890853corporate autofour-door car790.78
9135UP19263californiamcollege7524.4421941960personal autofour-door car691.20
9136Y167826californiamcollege2611.840770corporate autotwo-door car369.60
\n", + "

9137 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 washington unknown master \n", + "1 QZ44356 arizona female bachelor \n", + "2 AI49188 nevada female bachelor \n", + "3 WW63253 california male bachelor \n", + "4 GA49547 washington male high school or below \n", + "... ... ... ... ... \n", + "9132 LA72316 california m bachelor \n", + "9133 PK87824 california f college \n", + "9134 TD14365 california m bachelor \n", + "9135 UP19263 california m college \n", + "9136 Y167826 california m college \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 0.00 0 1000 \n", + "1 697953.59 0 94 \n", + "2 1288743.17 48767 108 \n", + "3 764586.18 0 106 \n", + "4 536307.65 36357 68 \n", + "... ... ... ... \n", + "9132 23405.99 71941 73 \n", + "9133 3096.51 21604 79 \n", + "9134 8163.89 0 85 \n", + "9135 7524.44 21941 96 \n", + "9136 2611.84 0 77 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 0 personal auto four-door car \n", + "1 0 personal auto four-door car \n", + "2 0 personal auto two-door car \n", + "3 0 corporate auto suv \n", + "4 0 personal auto four-door car \n", + "... ... ... ... \n", + "9132 0 personal auto four-door car \n", + "9133 0 corporate auto four-door car \n", + "9134 3 corporate auto four-door car \n", + "9135 0 personal auto four-door car \n", + "9136 0 corporate auto two-door car \n", + "\n", + " total_claim_amount \n", + "0 2.70 \n", + "1 1131.46 \n", + "2 566.47 \n", + "3 529.88 \n", + "4 17.27 \n", + "... ... 
\n", + "9132 198.23 \n", + "9133 379.20 \n", + "9134 790.78 \n", + "9135 691.20 \n", + "9136 369.60 \n", + "\n", + "[9137 rows x 11 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comb_df" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "f111a761", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(9137, 11)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comb_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "0d8ddfd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392washingtonunknownmaster0.00010000personal autofour-door car2.70
1QZ44356arizonafemalebachelor697953.590940personal autofour-door car1131.46
2AI49188nevadafemalebachelor1288743.17487671080personal autotwo-door car566.47
3WW63253californiamalebachelor764586.1801060corporate autosuv529.88
4GA49547washingtonmalehigh school or below536307.6536357680personal autofour-door car17.27
\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 washington unknown master \n", + "1 QZ44356 arizona female bachelor \n", + "2 AI49188 nevada female bachelor \n", + "3 WW63253 california male bachelor \n", + "4 GA49547 washington male high school or below \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 0.00 0 1000 \n", + "1 697953.59 0 94 \n", + "2 1288743.17 48767 108 \n", + "3 764586.18 0 106 \n", + "4 536307.65 36357 68 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 0 personal auto four-door car \n", + "1 0 personal auto four-door car \n", + "2 0 personal auto two-door car \n", + "3 0 corporate auto suv \n", + "4 0 personal auto four-door car \n", + "\n", + " total_claim_amount \n", + "0 2.70 \n", + "1 1131.46 \n", + "2 566.47 \n", + "3 529.88 \n", + "4 17.27 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comb_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "cb0dd902", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer object\n", + "state object\n", + "gender object\n", + "education object\n", + "customer_lifetime_value float64\n", + "income int64\n", + "monthly_premium_auto int64\n", + "number_of_open_complaints int64\n", + "policy_type object\n", + "vehicle_class object\n", + "total_claim_amount float64\n", + "dtype: object" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comb_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4a003cd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d89b38da", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17742cba", + "metadata": {}, + "outputs": [], + "source": [] + }, 
{ "cell_type": "markdown", "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", @@ -72,14 +3264,505 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, "outputs": [], "source": [ - "# Your code goes here" + "import pandas as pd\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n", + "df = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "ffacfc9f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
..................................................................
1090510905FE99816Nevada15563.369440NoPremiumBachelor2011-01-19UnemployedF...7Personal AutoPersonal L1Offer3Web1214.400000Luxury CarMedsizeA1
1090610906KX53892Oregon5259.444853NoBasicCollege2011-01-06EmployedF...6Personal AutoPersonal L3Offer2Branch273.018929Four-Door CarMedsizeA1
1090710907TL39050Arizona23893.304100NoExtendedBachelor2011-02-06EmployedF...2Corporate AutoCorporate L3Offer1Web381.306996Luxury SUVMedsizeA2
1090810908WA60547California11971.977650NoPremiumCollege2011-02-13EmployedF...6Personal AutoPersonal L1Offer1Branch618.288849SUVMedsizeA2
1090910909IV32877California6857.519928NoBasicBachelor2011-01-08UnemployedM...3Personal AutoPersonal L1Offer4Web1021.719397SUVMedsizeA1
\n", + "

10910 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "... ... ... ... ... ... \n", + "10905 10905 FE99816 Nevada 15563.369440 No \n", + "10906 10906 KX53892 Oregon 5259.444853 No \n", + "10907 10907 TL39050 Arizona 23893.304100 No \n", + "10908 10908 WA60547 California 11971.977650 No \n", + "10909 10909 IV32877 California 6857.519928 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "... ... ... ... ... ... ... \n", + "10905 Premium Bachelor 2011-01-19 Unemployed F ... \n", + "10906 Basic College 2011-01-06 Employed F ... \n", + "10907 Extended Bachelor 2011-02-06 Employed F ... \n", + "10908 Premium College 2011-02-13 Employed F ... \n", + "10909 Basic Bachelor 2011-01-08 Unemployed M ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "... ... ... ... ... 
\n", + "10905 7 Personal Auto Personal L1 Offer3 \n", + "10906 6 Personal Auto Personal L3 Offer2 \n", + "10907 2 Corporate Auto Corporate L3 Offer1 \n", + "10908 6 Personal Auto Personal L1 Offer1 \n", + "10909 3 Personal Auto Personal L1 Offer4 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "... ... ... ... ... \n", + "10905 Web 1214.400000 Luxury Car Medsize \n", + "10906 Branch 273.018929 Four-Door Car Medsize \n", + "10907 Web 381.306996 Luxury SUV Medsize \n", + "10908 Branch 618.288849 SUV Medsize \n", + "10909 Web 1021.719397 SUV Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "... ... ... \n", + "10905 A 1 \n", + "10906 A 1 \n", + "10907 A 2 \n", + "10908 A 2 \n", + "10909 A 1 \n", + "\n", + "[10910 rows x 27 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "3f630515", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Arizona', 'California', 'Washington', 'Oregon', 'Nevada'],\n", + " dtype=object)" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"state\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "7bfdd4f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "unnamed:_0 int64\n", + "customer object\n", + "state object\n", + "customer_lifetime_value float64\n", + "response object\n", + "coverage object\n", + "education object\n", + "effective_to_date object\n", + "employmentstatus object\n", + "gender object\n", 
+ "income int64\n", + "location_code object\n", + "marital_status object\n", + "monthly_premium_auto int64\n", + "months_since_last_claim float64\n", + "months_since_policy_inception int64\n", + "number_of_open_complaints float64\n", + "number_of_policies int64\n", + "policy_type object\n", + "policy object\n", + "renew_offer_type object\n", + "sales_channel object\n", + "total_claim_amount float64\n", + "vehicle_class object\n", + "vehicle_size object\n", + "vehicle_type object\n", + "month int64\n", + "dtype: object" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "a8cccc6e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Agent', 'Call Center', 'Branch', 'Web'], dtype=object)" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"sales_channel\"].unique()" ] }, { @@ -93,6 +3776,89 @@ "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." ] }, + { + "cell_type": "code", + "execution_count": 61, + "id": "68f9b972", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_claim_amount
sales_channel
Agent1810226.82
Branch1301204.00
Call Center926600.82
Web706600.04
\n", + "
" + ], + "text/plain": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_sales_channel = df.pivot_table(index=\"sales_channel\", values=\"total_claim_amount\", aggfunc='sum')\n", + "pivot_sales_channel = pivot_sales_channel.round(2)\n", + "pivot_sales_channel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "900557e2", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "640993b2-a291-436c-a34d-a551144f8196", @@ -103,6 +3869,103 @@ "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." ] }, + { + "cell_type": "code", + "execution_count": 64, + "id": "803a15bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationBachelorCollegeDoctorHigh School or BelowMaster
gender
F7874.277748.827328.518675.228157.05
M7703.608052.467415.338149.698168.83
\n", + "
" + ], + "text/plain": [ + "education Bachelor College Doctor High School or Below Master\n", + "gender \n", + "F 7874.27 7748.82 7328.51 8675.22 8157.05\n", + "M 7703.60 8052.46 7415.33 8149.69 8168.83" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_lifetime_value = df.pivot_table(index=\"gender\", columns=\"education\", values=\"customer_lifetime_value\",aggfunc=\"mean\")\n", + "pivot_lifetime_value = pivot_lifetime_value.round(2)\n", + "pivot_lifetime_value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a7cef6e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d64c57dc", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", @@ -130,7 +3993,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" @@ -160,7 +4023,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.4" } }, "nbformat": 4,