diff --git a/lab-dw-data-structuring-and-combining-SOLVED.ipynb b/lab-dw-data-structuring-and-combining-SOLVED.ipynb new file mode 100644 index 0000000..e035fc8 --- /dev/null +++ b/lab-dw-data-structuring-and-combining-SOLVED.ipynb @@ -0,0 +1,722 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "\n", + "# Load all three CSV files\n", + "file1 = pd.read_csv(\"filetwo.csv\") # file1 is mislabeled, it's actually file2\n", + "file2 = pd.read_csv(\"filethree.csv\") # file2 is mislabeled, it's actually file3\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e0674f28-82bd-4711-898a-e3425ed7b487", + "metadata": {}, + "outputs": [], + "source": [ + "# Standardize column names\n", + "def standardize_columns(df):\n", + " df.columns = df.columns.str.lower().str.replace(\" \", \"_\")\n", + " return df\n", + "\n", + "file1 = standardize_columns(file1)\n", + "file2 = standardize_columns(file2)\n", + "\n", + "# Clean file1 (remove '%' and extract complaint numbers)\n", + "file1['customer_lifetime_value'] = file1['customer_lifetime_value'].str.replace('%', '').astype(float) / 100\n", + "file1['number_of_open_complaints'] = file1['number_of_open_complaints'].str.extract('(\\d+)').astype(float)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "92da7f4c-cd97-475a-b893-9f0e9dc6c9c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstgendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintstotal_claim_amountpolicy_typevehicle_classstate
0GS98873ArizonaFBachelor3239.124716061881.0633.6Personal AutoFour-Door CarNaN
1CW49887CaliforniaFMaster4626.8011794871141.0547.2Special AutoSUVNaN
2MY31220CaliforniaFCollege8997.0402542301121.0537.6Personal AutoTwo-Door CarNaN
3UH35128OregonFCollege25807.0630712102141.01027.2Personal AutoLuxury CarNaN
4WH52799ArizonaFCollege3808.122194903941.0451.2Corporate AutoTwo-Door CarNaN
\n", + "
" + ], + "text/plain": [ + " customer st gender education customer_lifetime_value income \\\n", + "0 GS98873 Arizona F Bachelor 3239.1247 16061 \n", + "1 CW49887 California F Master 4626.8011 79487 \n", + "2 MY31220 California F College 8997.0402 54230 \n", + "3 UH35128 Oregon F College 25807.0630 71210 \n", + "4 WH52799 Arizona F College 3808.1221 94903 \n", + "\n", + " monthly_premium_auto number_of_open_complaints total_claim_amount \\\n", + "0 88 1.0 633.6 \n", + "1 114 1.0 547.2 \n", + "2 112 1.0 537.6 \n", + "3 214 1.0 1027.2 \n", + "4 94 1.0 451.2 \n", + "\n", + " policy_type vehicle_class state \n", + "0 Personal Auto Four-Door Car NaN \n", + "1 Special Auto SUV NaN \n", + "2 Personal Auto Two-Door Car NaN \n", + "3 Personal Auto Luxury Car NaN \n", + "4 Corporate Auto Two-Door Car NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Combine both cleaned dataframes\n", + "combined_data = pd.concat([file1, file2], ignore_index=True)\n", + "combined_data.head()\n" + ] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [], + "source": [ + "# Load marketing dataset\n", + "marketing_df = pd.read_csv(\"marketing_customer_analysis_clean.csv\")\n", + "\n", + "# Standardize column names\n", + "marketing_df.columns = marketing_df.columns.str.lower().str.replace(\" \", \"_\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d4eb8e1e-790b-4eff-85fb-41a859c8c0fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_claim_amount
sales_channel
Agent1810226.82
Branch1301204.00
Call Center926600.82
Web706600.04
\n", + "
" + ], + "text/plain": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Total revenue by sales channel\n", + "revenue_by_channel = marketing_df.pivot_table(\n", + " index='sales_channel',\n", + " values='total_claim_amount',\n", + " aggfunc='sum'\n", + ").round(2)\n", + "\n", + "revenue_by_channel\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "639d7348-ee0c-402f-b3fc-db4ad664e8de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_claim_amount
sales_channel
Agent1810226.82
Branch1301204.00
Call Center926600.82
Web706600.04
\n", + "
" + ], + "text/plain": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Total revenue by sales channel\n", + "revenue_by_channel = marketing_df.pivot_table(\n", + " index='sales_channel',\n", + " values='total_claim_amount',\n", + " aggfunc='sum'\n", + ").round(2)\n", + "\n", + "revenue_by_channel\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cdfb2193-bf11-4e17-a501-96ce3586bc4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
gendereducation
FBachelor7874.27
College7748.82
Doctor7328.51
High School or Below8675.22
Master8157.05
MBachelor7703.60
College8052.46
Doctor7415.33
High School or Below8149.69
Master8168.83
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value\n", + "gender education \n", + "F Bachelor 7874.27\n", + " College 7748.82\n", + " Doctor 7328.51\n", + " High School or Below 8675.22\n", + " Master 8157.05\n", + "M Bachelor 7703.60\n", + " College 8052.46\n", + " Doctor 7415.33\n", + " High School or Below 8149.69\n", + " Master 8168.83" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Average CLV by gender and education\n", + "clv_by_group = marketing_df.pivot_table(\n", + " index=['gender', 'education'],\n", + " values='customer_lifetime_value',\n", + " aggfunc='mean'\n", + ").round(2)\n", + "\n", + "clv_by_group\n" + ] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).\n", + "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." + ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3a069e0b-b400-470e-904d-d17582191be4", + "metadata": { + "id": "3a069e0b-b400-470e-904d-d17582191be4" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typemonthnumber_of_open_complaints
0Corporate AutoFebruary385.208135
1Corporate AutoJanuary443.434952
2Personal AutoFebruary1453.684441
3Personal AutoJanuary1727.605722
4Special AutoFebruary95.226817
5Special AutoJanuary87.074049
\n", + "
" + ], + "text/plain": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto February 385.208135\n", + "1 Corporate Auto January 443.434952\n", + "2 Personal Auto February 1453.684441\n", + "3 Personal Auto January 1727.605722\n", + "4 Special Auto February 95.226817\n", + "5 Special Auto January 87.074049" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Convert to datetime and extract month\n", + "marketing_df['effective_to_date'] = pd.to_datetime(marketing_df['effective_to_date'])\n", + "marketing_df['month'] = marketing_df['effective_to_date'].dt.month_name()\n", + "\n", + "# Complaints by policy type and month\n", + "complaints_table = marketing_df.groupby(['policy_type', 'month'])['number_of_open_complaints'].sum().reset_index()\n", + "complaints_table\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab01dff8-1e38-41c0-825a-78268c3bcf8f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}