Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 210 additions & 2 deletions lab-dw-aggregating.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,77 @@
" - have a response \"Yes\" to the last marketing campaign."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3c78618b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" total_claim_amount response\n",
"0 292.800000 No\n",
"1 744.924331 No\n",
"2 480.000000 No\n",
"3 484.013411 Yes\n",
"4 707.925645 No\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Marketing customer analysis CSV, read straight from its raw GitHub URL\n",
"url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n",
"\n",
"# Read the file and normalise the headers in one chain:\n",
"# lowercase every column name and swap spaces for underscores\n",
"df = pd.read_csv(url).rename(columns=lambda name: name.lower().replace(' ', '_'))\n",
"\n",
"# Peek at the claim amount and campaign response columns\n",
"print(df.loc[:, ['total_claim_amount', 'response']].head())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e9b16011",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtered DataFrame (low claim + said Yes):\n",
"(1399, 26)\n",
" customer total_claim_amount response gender education\n",
"0 XL78013 484.013411 Yes M College\n",
"1 FM55990 739.200000 Yes M College\n",
"2 CW49887 547.200000 Yes F Master\n",
"3 NJ54277 19.575683 Yes F College\n",
"4 MQ68407 60.036683 Yes F Bachelor\n"
]
}
],
"source": [
"# Exclusive upper bound on total_claim_amount for a claim to count as \"low\"\n",
"LOW_CLAIM_THRESHOLD = 1000\n",
"\n",
"# Normalise the 'response' labels (trim whitespace, title-case).\n",
"# The .str accessor leaves missing values as NaN; casting with astype(str)\n",
"# first would turn them into the literal string 'Nan' in the shared df.\n",
"df['response'] = df['response'].str.strip().str.title()\n",
"\n",
"# Keep customers with a low claim amount who answered \"Yes\".\n",
"# Boolean indexing already yields a new frame; reset_index(drop=True)\n",
"# renumbers its rows from 0 without keeping the old index as a column.\n",
"low_claim_yes_response = df[\n",
"    (df['total_claim_amount'] < LOW_CLAIM_THRESHOLD) &\n",
"    (df['response'] == 'Yes')\n",
"].reset_index(drop=True)\n",
"\n",
"print(\"Filtered DataFrame (low claim + said Yes):\")\n",
"print(low_claim_yes_response.shape)\n",
"print(low_claim_yes_response[['customer', 'total_claim_amount', 'response', 'gender', 'education']].head())"
]
},
{
"cell_type": "markdown",
"id": "b9be383e-5165-436e-80c8-57d4c757c8c3",
Expand All @@ -48,6 +119,55 @@
" - compare these insights to `total_claim_amount` patterns, and discuss which segments appear most profitable or low-risk for the company."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c347fa94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Simple Analysis by Policy Type:\n",
"\n",
"\n",
" avg_premium avg_claims customer_count profit\n",
"policy_type \n",
"Corporate Auto 93.29 421.74 323 -328.45\n",
"Personal Auto 95.06 454.98 1076 -359.92\n",
"Special Auto 89.46 441.94 67 -352.48\n",
"\n",
"Which policy type is most profitable?\n",
"\n",
"\n",
"Most profitable: Corporate Auto ($-328.45 profit per customer)\n"
]
}
],
"source": [
"# Rows where the campaign response is exactly 'Yes'\n",
"yes_customers = df[df['response'] == 'Yes']\n",
"\n",
"# One summary row per policy type, built with named aggregation\n",
"# (result column = (input column, function)), rounded to 2 decimals\n",
"simple_analysis = (\n",
"    yes_customers\n",
"    .groupby('policy_type')\n",
"    .agg(\n",
"        avg_premium=('monthly_premium_auto', 'mean'),\n",
"        avg_claims=('total_claim_amount', 'mean'),\n",
"        customer_count=('customer', 'count'),\n",
"    )\n",
"    .round(2)\n",
")\n",
"\n",
"# NOTE(review): 'profit' here is the mean of monthly_premium_auto minus the mean of\n",
"# total_claim_amount, i.e. a monthly figure minus a total - confirm this is the intended metric\n",
"simple_analysis['profit'] = simple_analysis['avg_premium'] - simple_analysis['avg_claims']\n",
"\n",
"print(\"Simple Analysis by Policy Type:\")\n",
"print(\"\\n\")\n",
"print(simple_analysis)\n",
"\n",
"print(\"\\nWhich policy type is most profitable?\")\n",
"print(\"\\n\")\n",
"# idxmax returns the policy_type label of the row with the largest 'profit'\n",
"most_profitable = simple_analysis['profit'].idxmax()\n",
"print(f\"Most profitable: {most_profitable} (${simple_analysis.loc[most_profitable, 'profit']:.2f} profit per customer)\")"
]
},
{
"cell_type": "markdown",
"id": "7050f4ac-53c5-4193-a3c0-8699b87196f0",
Expand All @@ -58,6 +178,60 @@
"3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "afa91674",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All states and their customer counts:\n",
"\n",
"\n",
" state customer_count\n",
"0 California 3552\n",
"1 Oregon 2909\n",
"2 Arizona 1937\n",
"3 Nevada 993\n",
"4 Washington 888\n",
"\n",
"\n",
"STATES WITH MORE THAN 500 CUSTOMERS:\n",
"\n",
"\n",
" state customer_count\n",
"0 California 3552\n",
"1 Oregon 2909\n",
"2 Arizona 1937\n",
"3 Nevada 993\n",
"4 Washington 888\n",
"\n",
"There are 5 states with more than 500 customers\n"
]
}
],
"source": [
"# A state is reported only if it has strictly more than this many customers\n",
"MIN_CUSTOMERS = 500\n",
"\n",
"# Count non-missing rows per state; value_counts() orders them from most to fewest.\n",
"# Column names are assigned explicitly so they do not depend on value_counts' defaults.\n",
"state_counts = df['state'].value_counts().reset_index()\n",
"state_counts.columns = ['state', 'customer_count']\n",
"\n",
"print(\"All states and their customer counts:\")\n",
"print(\"\\n\")\n",
"print(state_counts)\n",
"\n",
"# Strict comparison: a state with exactly MIN_CUSTOMERS customers is excluded\n",
"popular_states = state_counts[state_counts['customer_count'] > MIN_CUSTOMERS]\n",
"\n",
"print(\"\\n\")\n",
"# Heading wording matches the strict '>' filter above\n",
"print(f\"STATES WITH MORE THAN {MIN_CUSTOMERS} CUSTOMERS:\")\n",
"print(\"\\n\")\n",
"print(popular_states)\n",
"\n",
"print(f\"\\nThere are {len(popular_states)} states with more than {MIN_CUSTOMERS} customers\")"
]
},
{
"cell_type": "markdown",
"id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d",
Expand All @@ -68,6 +242,40 @@
"4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "27b8ac80",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" education gender max_clv min_clv median_clv\n",
"0 Bachelor F 73225.96 1904.00 5640.51\n",
"1 Bachelor M 67907.27 1898.01 5548.03\n",
"2 College F 61850.19 1898.68 5623.61\n",
"3 College M 61134.68 1918.12 6005.85\n",
"4 Doctor F 44856.11 2395.57 5332.46\n",
"5 Doctor M 32677.34 2267.60 5577.67\n",
"6 High School or Below F 55277.45 2144.92 6039.55\n",
"7 High School or Below M 83325.38 1940.98 6286.73\n",
"8 Master F 51016.07 2417.78 5729.86\n",
"9 Master M 50568.26 2272.31 5579.10\n"
]
}
],
"source": [
"# Max / min / median customer lifetime value for every education-gender pair.\n",
"# Keyword (named) aggregation labels the result columns; values are rounded to\n",
"# 2 decimals and the group keys are turned back into ordinary columns.\n",
"clv_stats = (\n",
"    df.groupby(['education', 'gender'])['customer_lifetime_value']\n",
"    .agg(max_clv='max', min_clv='min', median_clv='median')\n",
"    .round(2)\n",
"    .reset_index()\n",
")\n",
"\n",
"print(clv_stats)"
]
},
{
"cell_type": "markdown",
"id": "b42999f9-311f-481e-ae63-40a5577072c5",
Expand Down Expand Up @@ -143,7 +351,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -157,7 +365,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down