# %% Load the data
import pandas as pd

# Tunable parameters for the analysis below.
MAX_CLAIM_AMOUNT = 1000  # exclusive upper bound on total_claim_amount for exercise 1
YES = "Yes"              # normalized spelling of a positive campaign response

# Load the dataset from the URL
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv"
df = pd.read_csv(url)

# Standardize column names: lower-case, spaces -> underscores (literal replace, no regex).
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

print(df[['total_claim_amount', 'response']].head())

# %% 1. Customers with a low total claim amount who answered "Yes"
# Normalize whitespace and capitalization of the response labels.
# Deliberately no `.astype(str)`: that would turn missing responses into the
# literal string "Nan" and hide them from isna()/dropna(). The `.str` methods
# leave missing values as missing.
df['response'] = df['response'].str.strip().str.title()

# Build the two conditions separately so each one can be inspected on its own.
is_low_claim = df['total_claim_amount'] < MAX_CLAIM_AMOUNT
said_yes = df['response'] == YES  # missing responses compare False and are excluded

# Boolean indexing returns a new frame; renumber its rows from 0.
low_claim_yes_response = df.loc[is_low_claim & said_yes].reset_index(drop=True)

print("Filtered DataFrame (low claim + said Yes):")
print(low_claim_yes_response.shape)
print(low_claim_yes_response[['customer', 'total_claim_amount', 'response', 'gender', 'education']].head())

# %% 2. Premium vs. claims by policy type, "Yes" responders only
yes_customers = df[df['response'] == YES]

# Named aggregation ties every output column to its source column explicitly,
# instead of renaming the result columns by position afterwards.
policy_means = yes_customers.groupby('policy_type').agg(
    avg_premium=('monthly_premium_auto', 'mean'),
    avg_claims=('total_claim_amount', 'mean'),
    customer_count=('customer', 'count'),
)

# Compute the margin from the unrounded means and round once, for display only.
# NOTE(review): this subtracts a total claim amount from a *monthly* premium, so
# treat it as a relative proxy between segments, not an absolute profit figure -
# confirm the time bases before drawing conclusions.
policy_means['profit'] = policy_means['avg_premium'] - policy_means['avg_claims']
simple_analysis = policy_means.round(2)

print("Simple Analysis by Policy Type:")
print("\n")
print(simple_analysis)

print("\nWhich policy type is most profitable?")
print("\n")
if simple_analysis.empty:
    # idxmax() raises on an empty Series; report the situation instead of crashing.
    most_profitable = None
    print("No customers responded 'Yes', so there is nothing to compare.")
else:
    # Rank on the unrounded values so rounding cannot create artificial ties.
    most_profitable = policy_means['profit'].idxmax()
    best_margin = simple_analysis.loc[most_profitable, 'profit']
    # Put the sign in front of the currency symbol: -$328.45, not $-328.45.
    sign = '-' if best_margin < 0 else ''
    print(f"Most profitable: {most_profitable} ({sign}${abs(best_margin):.2f} profit per customer)")
# %% 3. Customers per state, keeping only states above a size threshold
# A state qualifies only with strictly MORE than this many rows.
MIN_CUSTOMERS = 500

# value_counts() returns one row count per state, largest first. Naming the
# index and the value column explicitly avoids renaming columns by position,
# whose default labels differ between pandas versions.
# NOTE(review): this counts rows; it equals the number of customers only if
# each customer appears once in `df` - confirm `customer` is unique.
state_counts = (
    df['state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='customer_count')
)

print("All states and their customer counts:")
print("\n")
print(state_counts)

# Filter for states with more than MIN_CUSTOMERS customers
popular_states = state_counts[state_counts['customer_count'] > MIN_CUSTOMERS]

print("\n")
# The heading now matches the strict ">" filter; "500+" would suggest ">= 500".
print(f"STATES WITH MORE THAN {MIN_CUSTOMERS} CUSTOMERS:")
print("\n")
print(popular_states)

print(f"\nThere are {len(popular_states)} states with more than {MIN_CUSTOMERS} customers")
# %% 4. Customer lifetime value spread by education level and gender
# Group the CLV column by (education, gender); rows with a missing key are
# left out by groupby's default behaviour.
clv_by_segment = df.groupby(['education', 'gender'])['customer_lifetime_value']

# One output column per statistic, rounded to 2 decimals, with the group keys
# turned back into ordinary columns.
clv_stats = (
    clv_by_segment
    .agg(max_clv='max', min_clv='min', median_clv='median')
    .round(2)
    .reset_index()
)

print(clv_stats)