data-bootcamp-v4 · flowind0 · Sep 27, 2025
diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb
@@ -36,6 +36,77 @@
         "   - have a response \"Yes\" to the last marketing campaign."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "id": "3c78618b",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "   total_claim_amount response\n",
+            "0          292.800000       No\n",
+            "1          744.924331       No\n",
+            "2          480.000000       No\n",
+            "3          484.013411      Yes\n",
+            "4          707.925645       No\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "\n",
+        "# Remote CSV with the marketing customer analysis data\n",
+        "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n",
+        "\n",
+        "# Read the file and normalise the headers in one pass:\n",
+        "# lower-case every column name and swap spaces for underscores\n",
+        "df = pd.read_csv(url).rename(columns=lambda name: name.lower().replace(' ', '_'))\n",
+        "\n",
+        "# Peek at the two columns used by the first exercise\n",
+        "preview_cols = ['total_claim_amount', 'response']\n",
+        "print(df[preview_cols].head())"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "id": "e9b16011",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Filtered DataFrame (low claim + said Yes):\n",
+            "(1399, 26)\n",
+            "  customer  total_claim_amount response gender education\n",
+            "0  XL78013          484.013411      Yes      M   College\n",
+            "1  FM55990          739.200000      Yes      M   College\n",
+            "2  CW49887          547.200000      Yes      F    Master\n",
+            "3  NJ54277           19.575683      Yes      F   College\n",
+            "4  MQ68407           60.036683      Yes      F  Bachelor\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Exclusive upper bound on total_claim_amount for a claim to count as \"low\"\n",
+        "LOW_CLAIM_THRESHOLD = 1000\n",
+        "\n",
+        "# Normalise the text in 'response' (trim surrounding whitespace, title-case).\n",
+        "# The .str accessor keeps missing values missing; casting with astype(str) first\n",
+        "# turns NaN in an object column into the literal string 'Nan', which hides the\n",
+        "# gaps from isna()/dropna() in every later cell that reads df.\n",
+        "df['response'] = df['response'].str.strip().str.title()\n",
+        "\n",
+        "# Build the two conditions separately so each can be inspected on its own.\n",
+        "# A missing response compares unequal to 'Yes', so those rows are excluded.\n",
+        "is_low_claim = df['total_claim_amount'] < LOW_CLAIM_THRESHOLD\n",
+        "said_yes = df['response'] == 'Yes'\n",
+        "\n",
+        "# Boolean indexing plus reset_index returns a new frame with a fresh 0..n-1 index,\n",
+        "# so the original df is left untouched and re-running the cell gives the same result\n",
+        "low_claim_yes_response = df.loc[is_low_claim & said_yes].reset_index(drop=True)\n",
+        "\n",
+        "print(\"Filtered DataFrame (low claim + said Yes):\")\n",
+        "print(low_claim_yes_response.shape)\n",
+        "print(low_claim_yes_response[['customer', 'total_claim_amount', 'response', 'gender', 'education']].head())"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "b9be383e-5165-436e-80c8-57d4c757c8c3",
@@ -48,6 +119,55 @@
         "   - compare these insights to `total_claim_amount` patterns, and discuss which segments appear most profitable or low-risk for the company."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "id": "c347fa94",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Simple Analysis by Policy Type:\n",
+            "\n",
+            "\n",
+            "                avg_premium  avg_claims  customer_count  profit\n",
+            "policy_type                                                    \n",
+            "Corporate Auto        93.29      421.74             323 -328.45\n",
+            "Personal Auto         95.06      454.98            1076 -359.92\n",
+            "Special Auto          89.46      441.94              67 -352.48\n",
+            "\n",
+            "Which policy type is most profitable?\n",
+            "\n",
+            "\n",
+            "Most profitable: Corporate Auto ($-328.45 profit per customer)\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Customers who answered \"Yes\" to the last marketing campaign\n",
+        "yes_customers = df.loc[df['response'] == 'Yes']\n",
+        "\n",
+        "# Per policy type: mean monthly premium, mean total claim amount and number of\n",
+        "# customers, rounded to two decimals. Named aggregation labels each output\n",
+        "# column directly instead of renaming by position afterwards.\n",
+        "simple_analysis = (\n",
+        "    yes_customers\n",
+        "    .groupby('policy_type')\n",
+        "    .agg(\n",
+        "        avg_premium=('monthly_premium_auto', 'mean'),\n",
+        "        avg_claims=('total_claim_amount', 'mean'),\n",
+        "        customer_count=('customer', 'count'),\n",
+        "    )\n",
+        "    .round(2)\n",
+        ")\n",
+        "\n",
+        "# 'profit' is the difference between the two rounded averages above.\n",
+        "# NOTE(review): this subtracts a total claim amount from a monthly premium, so it\n",
+        "# is presumably only a rough ranking signal rather than real profit - confirm.\n",
+        "simple_analysis['profit'] = simple_analysis['avg_premium'].sub(simple_analysis['avg_claims'])\n",
+        "\n",
+        "print(\"Simple Analysis by Policy Type:\")\n",
+        "print(\"\\n\")\n",
+        "print(simple_analysis)\n",
+        "\n",
+        "# Policy type whose 'profit' value is the largest\n",
+        "most_profitable = simple_analysis['profit'].idxmax()\n",
+        "\n",
+        "print(\"\\nWhich policy type is most profitable?\")\n",
+        "print(\"\\n\")\n",
+        "print(f\"Most profitable: {most_profitable} (${simple_analysis.loc[most_profitable, 'profit']:.2f} profit per customer)\")"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0",
@@ -58,6 +178,60 @@
         "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "id": "afa91674",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "All states and their customer counts:\n",
+            "\n",
+            "\n",
+            "        state  customer_count\n",
+            "0  California            3552\n",
+            "1      Oregon            2909\n",
+            "2     Arizona            1937\n",
+            "3      Nevada             993\n",
+            "4  Washington             888\n",
+            "\n",
+            "\n",
+            "STATES WITH 500+ CUSTOMERS:\n",
+            "\n",
+            "\n",
+            "        state  customer_count\n",
+            "0  California            3552\n",
+            "1      Oregon            2909\n",
+            "2     Arizona            1937\n",
+            "3      Nevada             993\n",
+            "4  Washington             888\n",
+            "\n",
+            "There are 5 states with more than 500 customers\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Number of customers per state, largest first, as a two-column frame.\n",
+        "# value_counts() skips rows whose state is missing.\n",
+        "state_counts = (\n",
+        "    df['state']\n",
+        "    .value_counts()\n",
+        "    .rename_axis('state')\n",
+        "    .reset_index(name='customer_count')\n",
+        ")\n",
+        "\n",
+        "print(\"All states and their customer counts:\")\n",
+        "print(\"\\n\")\n",
+        "print(state_counts)\n",
+        "\n",
+        "# Keep only the states with strictly more than 500 customers\n",
+        "has_many_customers = state_counts['customer_count'].gt(500)\n",
+        "popular_states = state_counts.loc[has_many_customers]\n",
+        "\n",
+        "print(\"\\n\")\n",
+        "print(\"STATES WITH 500+ CUSTOMERS:\")\n",
+        "print(\"\\n\")\n",
+        "print(popular_states)\n",
+        "\n",
+        "print(f\"\\nThere are {len(popular_states)} states with more than 500 customers\")"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d",
@@ -68,6 +242,40 @@
         "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "id": "27b8ac80",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "              education gender   max_clv  min_clv  median_clv\n",
+            "0              Bachelor      F  73225.96  1904.00     5640.51\n",
+            "1              Bachelor      M  67907.27  1898.01     5548.03\n",
+            "2               College      F  61850.19  1898.68     5623.61\n",
+            "3               College      M  61134.68  1918.12     6005.85\n",
+            "4                Doctor      F  44856.11  2395.57     5332.46\n",
+            "5                Doctor      M  32677.34  2267.60     5577.67\n",
+            "6  High School or Below      F  55277.45  2144.92     6039.55\n",
+            "7  High School or Below      M  83325.38  1940.98     6286.73\n",
+            "8                Master      F  51016.07  2417.78     5729.86\n",
+            "9                Master      M  50568.26  2272.31     5579.10\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Maximum, minimum and median customer lifetime value for every\n",
+        "# education/gender pair, rounded to two decimals and flattened to columns\n",
+        "clv_stats = (\n",
+        "    df.groupby(['education', 'gender'])['customer_lifetime_value']\n",
+        "    .agg(max_clv='max', min_clv='min', median_clv='median')\n",
+        "    .round(2)\n",
+        "    .reset_index()\n",
+        ")\n",
+        "\n",
+        "print(clv_stats)"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "b42999f9-311f-481e-ae63-40a5577072c5",
@@ -143,7 +351,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "base",
       "language": "python",
       "name": "python3"
     },
@@ -157,7 +365,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.13.5"
     }
   },
   "nbformat": 4,