From 561d209a55c920c5e8db7903b69f311674a934c8 Mon Sep 17 00:00:00 2001
From: davherdel <davherdel@gmail.com>
Date: Sat, 16 Aug 2025 15:51:21 +0100
Subject: [PATCH] Uploaded finished notebook

---
 lab-dw-aggregating.ipynb | 285 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 283 insertions(+), 2 deletions(-)

diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb
index fadd718..5f21188 100644
--- a/lab-dw-aggregating.ipynb
+++ b/lab-dw-aggregating.ipynb
@@ -36,6 +36,105 @@
         "   - have a response \"Yes\" to the last marketing campaign."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "id": "2ca88bdd",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Index(['Unnamed: 0', 'Customer', 'State', 'Customer Lifetime Value',\n",
+            "       'Response', 'Coverage', 'Education', 'Effective To Date',\n",
+            "       'EmploymentStatus', 'Gender', 'Income', 'Location Code',\n",
+            "       'Marital Status', 'Monthly Premium Auto', 'Months Since Last Claim',\n",
+            "       'Months Since Policy Inception', 'Number of Open Complaints',\n",
+            "       'Number of Policies', 'Policy Type', 'Policy', 'Renew Offer Type',\n",
+            "       'Sales Channel', 'Total Claim Amount', 'Vehicle Class', 'Vehicle Size',\n",
+            "       'Vehicle Type'],\n",
+            "      dtype='object')\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "\n",
+        "# Load the dataset from the URL\n",
+        "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n",
+        "df = pd.read_csv(url)\n",
+        "\n",
+        "# Take a quick look at the columns to understand the data structure\n",
+        "print(df.columns)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "id": "f1acdee1",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Number of customers with total_claim_amount < 1000 and response 'Yes': 1399\n",
+            "    unnamed:_0 customer       state  customer_lifetime_value response  \\\n",
+            "3            3  XL78013      Oregon             22332.439460      Yes   \n",
+            "8            8  FM55990  California              5989.773931      Yes   \n",
+            "15          15  CW49887  California              4626.801093      Yes   \n",
+            "19          19  NJ54277  California              3746.751625      Yes   \n",
+            "27          27  MQ68407      Oregon              4376.363592      Yes   \n",
+            "\n",
+            "    coverage education effective_to_date employmentstatus gender  ...  \\\n",
+            "3   Extended   College           1/11/11         Employed      M  ...   \n",
+            "8    Premium   College           1/19/11         Employed      M  ...   \n",
+            "15     Basic    Master           1/16/11         Employed      F  ...   \n",
+            "19  Extended   College           2/26/11         Employed      F  ...   \n",
+            "27   Premium  Bachelor           2/28/11         Employed      F  ...   \n",
+            "\n",
+            "    number_of_open_complaints number_of_policies     policy_type  \\\n",
+            "3                         0.0                  2  Corporate Auto   \n",
+            "8                         0.0                  1   Personal Auto   \n",
+            "15                        0.0                  1    Special Auto   \n",
+            "19                        1.0                  1   Personal Auto   \n",
+            "27                        0.0                  1   Personal Auto   \n",
+            "\n",
+            "          policy  renew_offer_type  sales_channel  total_claim_amount  \\\n",
+            "3   Corporate L3            Offer2         Branch          484.013411   \n",
+            "8    Personal L1            Offer2         Branch          739.200000   \n",
+            "15    Special L1            Offer2         Branch          547.200000   \n",
+            "19   Personal L2            Offer2    Call Center           19.575683   \n",
+            "27   Personal L3            Offer2          Agent           60.036683   \n",
+            "\n",
+            "    vehicle_class vehicle_size vehicle_type  \n",
+            "3   Four-Door Car      Medsize            A  \n",
+            "8      Sports Car      Medsize          NaN  \n",
+            "15            SUV      Medsize          NaN  \n",
+            "19   Two-Door Car        Large            A  \n",
+            "27  Four-Door Car      Medsize          NaN  \n",
+            "\n",
+            "[5 rows x 26 columns]\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Normalise the headers once so later cells can use lowercase, underscore-separated names\n",
+        "df.columns = df.columns.str.lower().str.replace(' ', '_')\n",
+        "\n",
+        "# Two conditions define the segment of interest:\n",
+        "is_low_claim = df['total_claim_amount'].lt(1000)  # total claim amount strictly below 1000\n",
+        "said_yes = df['response'].eq('Yes')  # answered \"Yes\" to the last marketing campaign\n",
+        "filtered_df = df[is_low_claim & said_yes]\n",
+        "\n",
+        "# Report how many rows satisfy both conditions\n",
+        "print(f\"Number of customers with total_claim_amount < 1000 and response 'Yes': {len(filtered_df)}\")\n",
+        "\n",
+        "# Show the first rows of the filtered data\n",
+        "print(filtered_df.head())"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "b9be383e-5165-436e-80c8-57d4c757c8c3",
@@ -48,6 +147,65 @@
         "   - compare these insights to `total_claim_amount` patterns, and discuss which segments appear most profitable or low-risk for the company."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "id": "639f9ffd",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Average Monthly Premium and Customer Lifetime Value by Policy Type and Gender:\n",
+            "\n",
+            "                       customer_lifetime_value  monthly_premium_auto\n",
+            "policy_type    gender                                               \n",
+            "Corporate Auto F                       7712.63                 94.30\n",
+            "               M                       7944.47                 92.19\n",
+            "Personal Auto  F                       8339.79                 99.00\n",
+            "               M                       7448.38                 91.09\n",
+            "Special Auto   F                       7691.58                 92.31\n",
+            "               M                       8247.09                 86.34\n",
+            "\n",
+            "Average Total Claim Amount by Policy Type and Gender:\n",
+            "\n",
+            "                       total_claim_amount\n",
+            "policy_type    gender                    \n",
+            "Corporate Auto F                   433.74\n",
+            "               M                   408.58\n",
+            "Personal Auto  F                   452.97\n",
+            "               M                   457.01\n",
+            "Special Auto   F                   453.28\n",
+            "               M                   429.53\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Restrict the analysis to customers who answered \"Yes\" to the last campaign\n",
+        "responded_yes = df.loc[df['response'].eq('Yes')]\n",
+        "\n",
+        "# Both summaries use the same segmentation (policy type, then gender),\n",
+        "# so the grouping keys are defined once and shared\n",
+        "segment_keys = ['policy_type', 'gender']\n",
+        "\n",
+        "# Mean monthly premium and mean customer lifetime value per segment, rounded to 2 decimals\n",
+        "pivot_avg = responded_yes.pivot_table(\n",
+        "    values=['monthly_premium_auto', 'customer_lifetime_value'], index=segment_keys, aggfunc='mean'\n",
+        ").round(2)\n",
+        "\n",
+        "print(\"Average Monthly Premium and Customer Lifetime Value by Policy Type and Gender:\\n\")\n",
+        "print(pivot_avg)\n",
+        "\n",
+        "# Mean total claim amount per segment, to set against the premium/CLV table\n",
+        "pivot_claims = responded_yes.pivot_table(\n",
+        "    values='total_claim_amount', index=segment_keys, aggfunc='mean'\n",
+        ").round(2)\n",
+        "\n",
+        "print(\"\\nAverage Total Claim Amount by Policy Type and Gender:\\n\")\n",
+        "print(pivot_claims)\n"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0",
@@ -58,6 +216,38 @@
         "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "id": "9c0902a3",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Number of customers per state (only states with > 500 customers):\n",
+            "state\n",
+            "California    3552\n",
+            "Oregon        2909\n",
+            "Arizona       1937\n",
+            "Nevada         993\n",
+            "Washington     888\n",
+            "Name: count, dtype: int64\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Count customers per state\n",
+        "customers_per_state = df['state'].value_counts()\n",
+        "\n",
+        "# Filter states with more than 500 customers\n",
+        "states_over_500 = customers_per_state[customers_per_state > 500]\n",
+        "\n",
+        "print(\"Number of customers per state (only states with > 500 customers):\")\n",
+        "print(states_over_500)"
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d",
@@ -68,6 +258,97 @@
         "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions."
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "id": "f7ebc1eb",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                                     max          min       median\n",
+            "education            gender                                       \n",
+            "Bachelor             F       73225.95652  1904.000852  5640.505303\n",
+            "                     M       67907.27050  1898.007675  5548.031892\n",
+            "College              F       61850.18803  1898.683686  5623.611187\n",
+            "                     M       61134.68307  1918.119700  6005.847375\n",
+            "Doctor               F       44856.11397  2395.570000  5332.462694\n",
+            "                     M       32677.34284  2267.604038  5577.669457\n",
+            "High School or Below F       55277.44589  2144.921535  6039.553187\n",
+            "                     M       83325.38119  1940.981221  6286.731006\n",
+            "Master               F       51016.06704  2417.777032  5729.855012\n",
+            "                     M       50568.25912  2272.307310  5579.099207\n"
+          ]
+        }
+      ],
+      "source": [
+        "clv_stats = df.groupby(['education', 'gender'])['customer_lifetime_value'].agg(['max', 'min', 'median'])\n",
+        "print(clv_stats)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "id": "890e60aa",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                                   max       min    median\n",
+            "education            gender                               \n",
+            "Bachelor             F       73,225.96  1,904.00  5,640.51\n",
+            "                     M       67,907.27  1,898.01  5,548.03\n",
+            "College              F       61,850.19  1,898.68  5,623.61\n",
+            "                     M       61,134.68  1,918.12  6,005.85\n",
+            "Doctor               F       44,856.11  2,395.57  5,332.46\n",
+            "                     M       32,677.34  2,267.60  5,577.67\n",
+            "High School or Below F       55,277.45  2,144.92  6,039.55\n",
+            "                     M       83,325.38  1,940.98  6,286.73\n",
+            "Master               F       51,016.07  2,417.78  5,729.86\n",
+            "                     M       50,568.26  2,272.31  5,579.10\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "C:\\Users\\jdhernandezd\\AppData\\Local\\Temp\\ipykernel_5112\\4277738231.py:5: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
+            "  clv_stats_formatted = clv_stats_rounded.applymap(lambda x: f\"{x:,.2f}\")\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Making numbers more readable\n",
+        "clv_stats_rounded = clv_stats.round(2)\n",
+        "\n",
+        "# Format with thousands separators\n",
+        "clv_stats_formatted = clv_stats_rounded.applymap(lambda x: f\"{x:,.2f}\")\n",
+        "\n",
+        "print(clv_stats_formatted)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "26e0122b",
+      "metadata": {},
+      "source": [
+        "### Conclusion\n",
+        "\n",
+        "1) The highest maximum customer lifetime value (CLV) is found among males with \"High School or Below\" education, indicating that higher education does not necessarily correlate with higher maximum CLV in this dataset.\n",
+        "\n",
+        "2) Median CLV values are relatively consistent across education levels (roughly 5,300 to 6,300), with the \"High School or Below\" group showing the highest medians (about 6,040 for women and 6,287 for men), suggesting stable customer value in this segment.\n",
+        "\n",
+        "3) Customers with a Doctorate degree have the lowest maximum CLV, which might be influenced by smaller sample size or different customer behaviors.\n",
+        "\n",
+        "4) Gender differences are present but not pronounced; both males and females exhibit similar median CLVs across education levels.\n",
+        "\n",
+        "In general, education level by itself does not strongly determine customer lifetime value in this dataset. Notably, some groups with lower formal education exhibit high and steady CLVs, emphasizing the need to evaluate additional variables when analyzing customer value."
+      ]
+    },
     {
       "cell_type": "markdown",
       "id": "b42999f9-311f-481e-ae63-40a5577072c5",
@@ -143,7 +424,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "base",
       "language": "python",
       "name": "python3"
     },
@@ -157,7 +438,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.12.7"
     }
   },
   "nbformat": 4,