diff --git a/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb new file mode 100644 index 0000000..f093a47 --- /dev/null +++ b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb @@ -0,0 +1,1442 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4008, 11) (996, 11) (7070, 11)\n" + ] + } + ], + "source": 
[ + "import pandas as pd\n", + "\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n", + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")\n", + "\n", + "print(df1.shape, df2.shape, df3.shape)\n", + "#df1.head()\n", + "#df2.head()\n", + "#df3.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d669a282-5584-49aa-9db6-39f49e0c9139", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(12074, 13)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim AmountStateGender
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323NaNNaN
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount State Gender \n", + "0 Four-Door Car 2.704934 NaN NaN \n", + "1 Four-Door Car 1131.464935 NaN NaN \n", + "2 Two-Door Car 566.472247 NaN NaN \n", + "3 SUV 529.881344 NaN NaN \n", + "4 Four-Door Car 17.269323 NaN NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.concat([df1, df2, df3], ignore_index=True)\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7ec71551-1d1e-42e7-857d-f6182c4de520", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'vehicle_class', 'total_claim_amount', 'state',\n", + " 'gender'],\n", + " dtype='object')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns = (\n", + " df.columns\n", + " .str.replace(\"ST\", \"state\", regex=False)\n", + " .str.lower()\n", + " .str.strip()\n", + " .str.replace(\" \", \"_\")\n", + ")\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "54754960-f4af-4415-bf02-f7934dc2b64e", + "metadata": {}, + "outputs": [], + 
"source": [ + "df['gender'] = df['gender'].replace({\n", + " 'F': 'F', 'Femal': 'F', 'Female': 'F', 'female': 'F',\n", + " 'M': 'M', 'Male': 'M', 'male': 'M'\n", + "})\n", + "\n", + "df['state'] = df['state'].replace({\n", + " 'AZ': 'Arizona',\n", + " 'Cali': 'California',\n", + " 'WA': 'Washington'\n", + "})\n", + "\n", + "df['education'] = df['education'].replace({'Bachelors': 'Bachelor'})\n", + "\n", + "df['customer_lifetime_value'] = (\n", + " df['customer_lifetime_value']\n", + " .astype(str)\n", + " .str.replace('%', '', regex=False)\n", + " .astype(float)\n", + ")\n", + "\n", + "df['vehicle_class'] = df['vehicle_class'].replace({\n", + " 'Sports Car': 'Luxury',\n", + " 'Luxury SUV': 'Luxury',\n", + " 'Luxury Car': 'Luxury'\n", + "})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5f348734-53b9-46c1-b1f5-90be4b327f48", + "metadata": {}, + "outputs": [], + "source": [ + "df['number_of_open_complaints'] = (\n", + " df['number_of_open_complaints']\n", + " .astype(str)\n", + " .str.split('/')\n", + " .str[1]\n", + " .astype(float)\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8b5cee0d-fa6c-47aa-8df5-4a9ff3120651", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 10007\n", + "gender 10129\n", + "education 2937\n", + "customer_lifetime_value 2944\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 10007\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "state 5004\n", + "gender 5004\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7aa802df-5ef5-4794-885a-ec3cb29c3aff", + "metadata": {}, + "outputs": [], + "source": [ + "df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(0)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 9, + "id": "d68dca87-ef24-4d10-85d3-d577d5bd9c11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 10007\n", + "gender 10129\n", + "education 2937\n", + "customer_lifetime_value 2944\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 0\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "state 5004\n", + "gender 5004\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "87a9b5cd-2464-4818-87e4-6e5ee153e91b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountstategender
0RB50392WashingtonNaNMasterNaN0.01000.00.0Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor6.979536e+050.094.00.0Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1.288743e+0648767.0108.00.0Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor7.645862e+050.0106.00.0Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below5.363077e+0536357.068.00.0Personal AutoFour-Door Car17.269323NaNNaN
..........................................
9130LA72316NaNNaNBachelor2.340599e+0471941.073.00.0Personal AutoFour-Door Car198.234764CaliforniaM
9131PK87824NaNNaNCollege3.096511e+0321604.079.00.0Corporate AutoFour-Door Car379.200000CaliforniaF
9132TD14365NaNNaNBachelor8.163890e+030.085.00.0Corporate AutoFour-Door Car790.784983CaliforniaM
9133UP19263NaNNaNCollege7.524442e+0321941.096.00.0Personal AutoFour-Door Car691.200000CaliforniaM
9134Y167826NaNNaNCollege2.611837e+030.077.00.0Corporate AutoTwo-Door Car369.600000CaliforniaM
\n", + "

9135 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "9130 LA72316 NaN NaN Bachelor \n", + "9131 PK87824 NaN NaN College \n", + "9132 TD14365 NaN NaN Bachelor \n", + "9133 UP19263 NaN NaN College \n", + "9134 Y167826 NaN NaN College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 6.979536e+05 0.0 94.0 \n", + "2 1.288743e+06 48767.0 108.0 \n", + "3 7.645862e+05 0.0 106.0 \n", + "4 5.363077e+05 36357.0 68.0 \n", + "... ... ... ... \n", + "9130 2.340599e+04 71941.0 73.0 \n", + "9131 3.096511e+03 21604.0 79.0 \n", + "9132 8.163890e+03 0.0 85.0 \n", + "9133 7.524442e+03 21941.0 96.0 \n", + "9134 2.611837e+03 0.0 77.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 0.0 Personal Auto Four-Door Car \n", + "1 0.0 Personal Auto Four-Door Car \n", + "2 0.0 Personal Auto Two-Door Car \n", + "3 0.0 Corporate Auto SUV \n", + "4 0.0 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "9130 0.0 Personal Auto Four-Door Car \n", + "9131 0.0 Corporate Auto Four-Door Car \n", + "9132 0.0 Corporate Auto Four-Door Car \n", + "9133 0.0 Personal Auto Four-Door Car \n", + "9134 0.0 Corporate Auto Two-Door Car \n", + "\n", + " total_claim_amount state gender \n", + "0 2.704934 NaN NaN \n", + "1 1131.464935 NaN NaN \n", + "2 566.472247 NaN NaN \n", + "3 529.881344 NaN NaN \n", + "4 17.269323 NaN NaN \n", + "... ... ... ... 
\n", + "9130 198.234764 California M \n", + "9131 379.200000 California F \n", + "9132 790.784983 California M \n", + "9133 691.200000 California M \n", + "9134 369.600000 California M \n", + "\n", + "[9135 rows x 13 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.drop_duplicates().reset_index(drop=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac898103-cfef-4501-b55b-9d3c16433401", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "67ffbb9c-8393-45f0-9547-a6526ec9774a", + "metadata": {}, + "outputs": [], + "source": [ + "num_cols = df.select_dtypes(include=['float64', 'int64']).columns\n", + "df[num_cols] = df[num_cols].fillna(0)\n", + "#df[num_cols].isnull().sum()\n", + "df[num_cols] = df[num_cols].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "43505402-b509-4ecf-b2da-90bd98790547", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 9135 entries, 0 to 9134\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 customer 9134 non-null object\n", + " 1 state 2064 non-null object\n", + " 2 gender 1942 non-null object\n", + " 3 education 9134 non-null object\n", + " 4 customer_lifetime_value 9135 non-null int64 \n", + " 5 income 9135 non-null int64 \n", + " 6 monthly_premium_auto 9135 non-null int64 \n", + " 7 number_of_open_complaints 9135 non-null int64 \n", + " 8 policy_type 9134 non-null object\n", + " 9 vehicle_class 9134 non-null object\n", + " 10 total_claim_amount 9135 non-null int64 \n", + " 11 state 7070 non-null object\n", + " 12 gender 7070 non-null object\n", + "dtypes: int64(5), object(8)\n", + "memory usage: 927.9+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountstategender
0RB50392WashingtonNaNMaster0010000Personal AutoFour-Door Car2NaNNaN
1QZ44356ArizonaFBachelor6979530940Personal AutoFour-Door Car1131NaNNaN
2AI49188NevadaFBachelor1288743487671080Personal AutoTwo-Door Car566NaNNaN
3WW63253CaliforniaMBachelor76458601060Corporate AutoSUV529NaNNaN
4GA49547WashingtonMHigh School or Below53630736357680Personal AutoFour-Door Car17NaNNaN
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington NaN Master 0 \n", + "1 QZ44356 Arizona F Bachelor 697953 \n", + "2 AI49188 Nevada F Bachelor 1288743 \n", + "3 WW63253 California M Bachelor 764586 \n", + "4 GA49547 Washington M High School or Below 536307 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0 1000 0 Personal Auto \n", + "1 0 94 0 Personal Auto \n", + "2 48767 108 0 Personal Auto \n", + "3 0 106 0 Corporate Auto \n", + "4 36357 68 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount state gender \n", + "0 Four-Door Car 2 NaN NaN \n", + "1 Four-Door Car 1131 NaN NaN \n", + "2 Two-Door Car 566 NaN NaN \n", + "3 SUV 529 NaN NaN \n", + "4 Four-Door Car 17 NaN NaN " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10910, 27)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv')\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "743f1e66-d458-4747-ba43-c7b6d031722c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 27 
columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 unnamed:_0 10910 non-null int64 \n", + " 1 customer 10910 non-null object \n", + " 2 state 10910 non-null object \n", + " 3 customer_lifetime_value 10910 non-null float64\n", + " 4 response 10910 non-null object \n", + " 5 coverage 10910 non-null object \n", + " 6 education 10910 non-null object \n", + " 7 effective_to_date 10910 non-null object \n", + " 8 employmentstatus 10910 non-null object \n", + " 9 gender 10910 non-null object \n", + " 10 income 10910 non-null int64 \n", + " 11 location_code 10910 non-null object \n", + " 12 marital_status 10910 non-null object \n", + " 13 monthly_premium_auto 10910 non-null int64 \n", + " 14 months_since_last_claim 10910 non-null float64\n", + " 15 months_since_policy_inception 10910 non-null int64 \n", + " 16 number_of_open_complaints 10910 non-null float64\n", + " 17 number_of_policies 10910 non-null int64 \n", + " 18 policy_type 10910 non-null object \n", + " 19 policy 10910 non-null object \n", + " 20 renew_offer_type 10910 non-null object \n", + " 21 sales_channel 10910 non-null object \n", + " 22 total_claim_amount 10910 non-null float64\n", + " 23 vehicle_class 10910 non-null object \n", + " 24 vehicle_size 10910 non-null object \n", + " 25 vehicle_type 10910 non-null object \n", + " 26 month 10910 non-null int64 \n", + "dtypes: float64(4), int64(6), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + }, + { + "data": { + "text/plain": [ + "unnamed:_0 0\n", + "customer 0\n", + "state 0\n", + "customer_lifetime_value 0\n", + "response 0\n", + "coverage 0\n", + "education 0\n", + "effective_to_date 0\n", + "employmentstatus 0\n", + "gender 0\n", + "income 0\n", + "location_code 0\n", + "marital_status 0\n", + "monthly_premium_auto 0\n", + "months_since_last_claim 0\n", + "months_since_policy_inception 0\n", + "number_of_open_complaints 0\n", + "number_of_policies 0\n", + "policy_type 0\n", + "policy 0\n", + "renew_offer_type 
0\n", + "sales_channel 0\n", + "total_claim_amount 0\n", + "vehicle_class 0\n", + "vehicle_size 0\n", + "vehicle_type 0\n", + "month 0\n", + "dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info()\n", + "df.describe()\n", + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "28a56e42-3a6e-4f75-b978-dc495e29a452", + "metadata": {}, + "outputs": [], + "source": [ + "demographics = df[['customer', 'gender', 'education', 'marital_status', 'income']]\n", + "\n", + "policy_info = df[['policy_type', 'policy', 'vehicle_class']]\n", + "\n", + "marketing = df[['response', 'coverage', 'sales_channel']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d1830e4-e285-4855-960f-15c4aaf79de5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work in the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (agent, branch, call center, and web).\n", + "Round the total revenue to 2 decimal places. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." 
+ ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." + ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "1d47d409-38ca-4884-b081-4c10340eb3f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04\n" + ] + } + ], + "source": [ + "pivot_revenue = pd.pivot_table(\n", + " df,\n", + " values='total_claim_amount',\n", + " index='sales_channel',\n", + " aggfunc='sum'\n", + ").round(2)\n", + "\n", + "print(pivot_revenue)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f65a657b-0a7e-4fc9-9a35-457d8e2060c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "education Bachelor College Doctor High School or Below Master\n", + "gender \n", + "F 7874.27 7748.82 7328.51 8675.22 8157.05\n", + "M 7703.60 8052.46 
7415.33 8149.69 8168.83\n" + ] + } + ], + "source": [ + "pivot_clv = pd.pivot_table(\n", + " df,\n", + " values='customer_lifetime_value',\n", + " index='gender',\n", + " columns='education',\n", + " aggfunc='mean'\n", + ").round(2)\n", + "\n", + "print(pivot_clv)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "675082f1-2dd0-4096-9e77-99716730b4db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto February 1089\n", + "1 Corporate Auto January 1252\n", + "2 Personal Auto February 3799\n", + "3 Personal Auto January 4329\n", + "4 Special Auto February 204\n" + ] + } + ], + "source": [ + "df['month'] = pd.to_datetime(df['effective_to_date']).dt.month_name()\n", + "\n", + "pivot_complaints = pd.pivot_table(\n", + " df,\n", + " values='number_of_open_complaints',\n", + " index=['policy_type', 'month'],\n", + " aggfunc='count'\n", + ").reset_index()\n", + "\n", + "print(pivot_complaints.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9727d1a-797c-4eb5-9ec3-004765f1bded", + "metadata": {}, + "outputs": [], + "source": [ + "# Shows which months have more complaints per policy category. 
Caveat: aggfunc='count' in the previous cell counts rows per policy type and month, not complaints; aggfunc='sum' would total the open complaints.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "737a4b0f-db95-4357-8e3d-fb7aab35125c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lab-dw-data-structuring-and-combining.ipynb 
b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..f093a47 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,14 +36,856 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4008, 11) (996, 11) (7070, 11)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n", + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")\n", + "\n", + "print(df1.shape, df2.shape, df3.shape)\n", + "#df1.head()\n", + "#df2.head()\n", + "#df3.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d669a282-5584-49aa-9db6-39f49e0c9139", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(12074, 13)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim AmountStateGender
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323NaNNaN
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount State Gender \n", + "0 Four-Door Car 2.704934 NaN NaN \n", + "1 Four-Door Car 1131.464935 NaN NaN \n", + "2 Two-Door Car 566.472247 NaN NaN \n", + "3 SUV 529.881344 NaN NaN \n", + "4 Four-Door Car 17.269323 NaN NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.concat([df1, df2, df3], ignore_index=True)\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7ec71551-1d1e-42e7-857d-f6182c4de520", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['customer', 'state', 'gender', 'education', 'customer_lifetime_value',\n", + " 'income', 'monthly_premium_auto', 'number_of_open_complaints',\n", + " 'policy_type', 'vehicle_class', 'total_claim_amount', 'state',\n", + " 'gender'],\n", + " dtype='object')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ST/State and GENDER/Gender are the same fields: renaming after the concat leaves duplicate\n", + "# 'state'/'gender' columns, so normalise each file's headers first and rebuild the combined frame.\n", + "def normalise_columns(frame):\n", + "    names = frame.columns.str.replace(\"ST\", \"state\", regex=False).str.lower().str.strip().str.replace(\" \", \"_\")\n", + "    return frame.set_axis(names, axis=1)\n", + "\n", + "df = pd.concat([normalise_columns(f) for f in (df1, df2, df3)], ignore_index=True)\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "54754960-f4af-4415-bf02-f7934dc2b64e", + "metadata": {}, + "outputs": [], + 
"source": [ + "df['gender'] = df['gender'].replace({\n", + " 'F': 'F', 'Femal': 'F', 'Female': 'F', 'female': 'F',\n", + " 'M': 'M', 'Male': 'M', 'male': 'M'\n", + "})\n", + "\n", + "df['state'] = df['state'].replace({\n", + " 'AZ': 'Arizona',\n", + " 'Cali': 'California',\n", + " 'WA': 'Washington'\n", + "})\n", + "\n", + "df['education'] = df['education'].replace({'Bachelors': 'Bachelor'})\n", + "\n", + "df['customer_lifetime_value'] = (\n", + " df['customer_lifetime_value']\n", + " .astype(str)\n", + " .str.replace('%', '', regex=False)\n", + " .astype(float)\n", + ")\n", + "\n", + "df['vehicle_class'] = df['vehicle_class'].replace({\n", + " 'Sports Car': 'Luxury',\n", + " 'Luxury SUV': 'Luxury',\n", + " 'Luxury Car': 'Luxury'\n", + "})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5f348734-53b9-46c1-b1f5-90be4b327f48", + "metadata": {}, "outputs": [], "source": [ - "# Your code goes here" + "# '1/0/00'-style strings carry the count in the middle field; plain integer counts have no '/' and are parsed as-is.\n", + "complaints_raw = df['number_of_open_complaints'].astype(str)\n", + "df['number_of_open_complaints'] = pd.to_numeric(\n", + "    complaints_raw.str.split('/').str[1]\n", + "    .fillna(complaints_raw),\n", + "    errors='coerce'\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8b5cee0d-fa6c-47aa-8df5-4a9ff3120651", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 10007\n", + "gender 10129\n", + "education 2937\n", + "customer_lifetime_value 2944\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 10007\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "state 5004\n", + "gender 5004\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7aa802df-5ef5-4794-885a-ec3cb29c3aff", + "metadata": {}, + "outputs": [], + "source": [ + "df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(0)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 9, + "id": "d68dca87-ef24-4d10-85d3-d577d5bd9c11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer 2937\n", + "state 10007\n", + "gender 10129\n", + "education 2937\n", + "customer_lifetime_value 2944\n", + "income 2937\n", + "monthly_premium_auto 2937\n", + "number_of_open_complaints 0\n", + "policy_type 2937\n", + "vehicle_class 2937\n", + "total_claim_amount 2937\n", + "state 5004\n", + "gender 5004\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "87a9b5cd-2464-4818-87e4-6e5ee153e91b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountstategender
0RB50392WashingtonNaNMasterNaN0.01000.00.0Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor6.979536e+050.094.00.0Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1.288743e+0648767.0108.00.0Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor7.645862e+050.0106.00.0Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below5.363077e+0536357.068.00.0Personal AutoFour-Door Car17.269323NaNNaN
..........................................
9130LA72316NaNNaNBachelor2.340599e+0471941.073.00.0Personal AutoFour-Door Car198.234764CaliforniaM
9131PK87824NaNNaNCollege3.096511e+0321604.079.00.0Corporate AutoFour-Door Car379.200000CaliforniaF
9132TD14365NaNNaNBachelor8.163890e+030.085.00.0Corporate AutoFour-Door Car790.784983CaliforniaM
9133UP19263NaNNaNCollege7.524442e+0321941.096.00.0Personal AutoFour-Door Car691.200000CaliforniaM
9134Y167826NaNNaNCollege2.611837e+030.077.00.0Corporate AutoTwo-Door Car369.600000CaliforniaM
\n", + "

9135 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " customer state gender education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "9130 LA72316 NaN NaN Bachelor \n", + "9131 PK87824 NaN NaN College \n", + "9132 TD14365 NaN NaN Bachelor \n", + "9133 UP19263 NaN NaN College \n", + "9134 Y167826 NaN NaN College \n", + "\n", + " customer_lifetime_value income monthly_premium_auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 6.979536e+05 0.0 94.0 \n", + "2 1.288743e+06 48767.0 108.0 \n", + "3 7.645862e+05 0.0 106.0 \n", + "4 5.363077e+05 36357.0 68.0 \n", + "... ... ... ... \n", + "9130 2.340599e+04 71941.0 73.0 \n", + "9131 3.096511e+03 21604.0 79.0 \n", + "9132 8.163890e+03 0.0 85.0 \n", + "9133 7.524442e+03 21941.0 96.0 \n", + "9134 2.611837e+03 0.0 77.0 \n", + "\n", + " number_of_open_complaints policy_type vehicle_class \\\n", + "0 0.0 Personal Auto Four-Door Car \n", + "1 0.0 Personal Auto Four-Door Car \n", + "2 0.0 Personal Auto Two-Door Car \n", + "3 0.0 Corporate Auto SUV \n", + "4 0.0 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "9130 0.0 Personal Auto Four-Door Car \n", + "9131 0.0 Corporate Auto Four-Door Car \n", + "9132 0.0 Corporate Auto Four-Door Car \n", + "9133 0.0 Personal Auto Four-Door Car \n", + "9134 0.0 Corporate Auto Two-Door Car \n", + "\n", + " total_claim_amount state gender \n", + "0 2.704934 NaN NaN \n", + "1 1131.464935 NaN NaN \n", + "2 566.472247 NaN NaN \n", + "3 529.881344 NaN NaN \n", + "4 17.269323 NaN NaN \n", + "... ... ... ... 
\n", + "9130 198.234764 California M \n", + "9131 379.200000 California F \n", + "9132 790.784983 California M \n", + "9133 691.200000 California M \n", + "9134 369.600000 California M \n", + "\n", + "[9135 rows x 13 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.drop_duplicates().reset_index(drop=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac898103-cfef-4501-b55b-9d3c16433401", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "67ffbb9c-8393-45f0-9547-a6526ec9774a", + "metadata": {}, + "outputs": [], + "source": [ + "num_cols = df.select_dtypes(include=['float64', 'int64']).columns\n", + "df[num_cols] = df[num_cols].fillna(0)\n", + "#df[num_cols].isnull().sum()\n", + "df[num_cols] = df[num_cols].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "43505402-b509-4ecf-b2da-90bd98790547", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 9135 entries, 0 to 9134\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 customer 9134 non-null object\n", + " 1 state 2064 non-null object\n", + " 2 gender 1942 non-null object\n", + " 3 education 9134 non-null object\n", + " 4 customer_lifetime_value 9135 non-null int64 \n", + " 5 income 9135 non-null int64 \n", + " 6 monthly_premium_auto 9135 non-null int64 \n", + " 7 number_of_open_complaints 9135 non-null int64 \n", + " 8 policy_type 9134 non-null object\n", + " 9 vehicle_class 9134 non-null object\n", + " 10 total_claim_amount 9135 non-null int64 \n", + " 11 state 7070 non-null object\n", + " 12 gender 7070 non-null object\n", + "dtypes: int64(5), object(8)\n", + "memory usage: 927.9+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountstategender
0RB50392WashingtonNaNMaster0010000Personal AutoFour-Door Car2NaNNaN
1QZ44356ArizonaFBachelor6979530940Personal AutoFour-Door Car1131NaNNaN
2AI49188NevadaFBachelor1288743487671080Personal AutoTwo-Door Car566NaNNaN
3WW63253CaliforniaMBachelor76458601060Corporate AutoSUV529NaNNaN
4GA49547WashingtonMHigh School or Below53630736357680Personal AutoFour-Door Car17NaNNaN
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington NaN Master 0 \n", + "1 QZ44356 Arizona F Bachelor 697953 \n", + "2 AI49188 Nevada F Bachelor 1288743 \n", + "3 WW63253 California M Bachelor 764586 \n", + "4 GA49547 Washington M High School or Below 536307 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0 1000 0 Personal Auto \n", + "1 0 94 0 Personal Auto \n", + "2 48767 108 0 Personal Auto \n", + "3 0 106 0 Corporate Auto \n", + "4 36357 68 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount state gender \n", + "0 Four-Door Car 2 NaN NaN \n", + "1 Four-Door Car 1131 NaN NaN \n", + "2 Two-Door Car 566 NaN NaN \n", + "3 SUV 529 NaN NaN \n", + "4 Four-Door Car 17 NaN NaN " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info()\n", + "df.head()" ] }, { @@ -72,16 +914,351 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10910, 27)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv')\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "743f1e66-d458-4747-ba43-c7b6d031722c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 27 
columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 unnamed:_0 10910 non-null int64 \n", + " 1 customer 10910 non-null object \n", + " 2 state 10910 non-null object \n", + " 3 customer_lifetime_value 10910 non-null float64\n", + " 4 response 10910 non-null object \n", + " 5 coverage 10910 non-null object \n", + " 6 education 10910 non-null object \n", + " 7 effective_to_date 10910 non-null object \n", + " 8 employmentstatus 10910 non-null object \n", + " 9 gender 10910 non-null object \n", + " 10 income 10910 non-null int64 \n", + " 11 location_code 10910 non-null object \n", + " 12 marital_status 10910 non-null object \n", + " 13 monthly_premium_auto 10910 non-null int64 \n", + " 14 months_since_last_claim 10910 non-null float64\n", + " 15 months_since_policy_inception 10910 non-null int64 \n", + " 16 number_of_open_complaints 10910 non-null float64\n", + " 17 number_of_policies 10910 non-null int64 \n", + " 18 policy_type 10910 non-null object \n", + " 19 policy 10910 non-null object \n", + " 20 renew_offer_type 10910 non-null object \n", + " 21 sales_channel 10910 non-null object \n", + " 22 total_claim_amount 10910 non-null float64\n", + " 23 vehicle_class 10910 non-null object \n", + " 24 vehicle_size 10910 non-null object \n", + " 25 vehicle_type 10910 non-null object \n", + " 26 month 10910 non-null int64 \n", + "dtypes: float64(4), int64(6), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + }, + { + "data": { + "text/plain": [ + "unnamed:_0 0\n", + "customer 0\n", + "state 0\n", + "customer_lifetime_value 0\n", + "response 0\n", + "coverage 0\n", + "education 0\n", + "effective_to_date 0\n", + "employmentstatus 0\n", + "gender 0\n", + "income 0\n", + "location_code 0\n", + "marital_status 0\n", + "monthly_premium_auto 0\n", + "months_since_last_claim 0\n", + "months_since_policy_inception 0\n", + "number_of_open_complaints 0\n", + "number_of_policies 0\n", + "policy_type 0\n", + "policy 0\n", + "renew_offer_type 
0\n", + "sales_channel 0\n", + "total_claim_amount 0\n", + "vehicle_class 0\n", + "vehicle_size 0\n", + "vehicle_type 0\n", + "month 0\n", + "dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info()\n", + "df.describe()\n", + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "28a56e42-3a6e-4f75-b978-dc495e29a452", + "metadata": {}, "outputs": [], "source": [ - "# Your code goes here" + "demographics = df[['customer', 'gender', 'education', 'marital_status', 'income']]\n", + "\n", + "policy_info = df[['policy_type', 'policy', 'vehicle_class']]\n", + "\n", + "marketing = df[['response', 'coverage', 'sales_channel']]" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d1830e4-e285-4855-960f-15c4aaf79de5", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", @@ -128,17 +1305,114 @@ "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" ] }, + { + "cell_type": "code", + "execution_count": 17, + "id": "1d47d409-38ca-4884-b081-4c10340eb3f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04\n" + ] + } + ], + "source": [ + "pivot_revenue = pd.pivot_table(\n", + " df,\n", + " values='total_claim_amount',\n", + " index='sales_channel',\n", + " aggfunc='sum'\n", + ").round(2)\n", + "\n", + "print(pivot_revenue)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f65a657b-0a7e-4fc9-9a35-457d8e2060c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "education Bachelor College Doctor High School or Below Master\n", + "gender \n", + "F 7874.27 7748.82 7328.51 8675.22 
8157.05\n", + "M 7703.60 8052.46 7415.33 8149.69 8168.83\n" + ] + } + ], + "source": [ + "pivot_clv = pd.pivot_table(\n", + " df,\n", + " values='customer_lifetime_value',\n", + " index='gender',\n", + " columns='education',\n", + " aggfunc='mean'\n", + ").round(2)\n", + "\n", + "print(pivot_clv)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "675082f1-2dd0-4096-9e77-99716730b4db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto February 1089\n", + "1 Corporate Auto January 1252\n", + "2 Personal Auto February 3799\n", + "3 Personal Auto January 4329\n", + "4 Special Auto February 204\n" + ] + } + ], + "source": [ + "df['month'] = pd.to_datetime(df['effective_to_date']).dt.month_name()\n", + "\n", + "pivot_complaints = pd.pivot_table(\n", + " df,\n", + " values='number_of_open_complaints',\n", + " index=['policy_type', 'month'],\n", + " aggfunc='sum'\n", + ").reset_index()\n", + "\n", + "print(pivot_complaints.head())" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "3a069e0b-b400-470e-904d-d17582191be4", - "metadata": { - "id": "3a069e0b-b400-470e-904d-d17582191be4" - }, + "id": "a9727d1a-797c-4eb5-9ec3-004765f1bded", + "metadata": {}, "outputs": [], "source": [ - "# Your code goes here" + "# Shows which months have the most complaints for each policy type.\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "737a4b0f-db95-4357-8e3d-fb7aab35125c", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -146,7 +1420,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -160,7 +1434,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.7" } }, "nbformat": 4,