From 561d209a55c920c5e8db7903b69f311674a934c8 Mon Sep 17 00:00:00 2001 From: davherdel Date: Sat, 16 Aug 2025 15:51:21 +0100 Subject: [PATCH] Uploaded finished notebook --- lab-dw-aggregating.ipynb | 285 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 283 insertions(+), 2 deletions(-) diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fadd718..5f21188 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -36,6 +36,105 @@ " - have a response \"Yes\" to the last marketing campaign." ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2ca88bdd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Unnamed: 0', 'Customer', 'State', 'Customer Lifetime Value',\n", + " 'Response', 'Coverage', 'Education', 'Effective To Date',\n", + " 'EmploymentStatus', 'Gender', 'Income', 'Location Code',\n", + " 'Marital Status', 'Monthly Premium Auto', 'Months Since Last Claim',\n", + " 'Months Since Policy Inception', 'Number of Open Complaints',\n", + " 'Number of Policies', 'Policy Type', 'Policy', 'Renew Offer Type',\n", + " 'Sales Channel', 'Total Claim Amount', 'Vehicle Class', 'Vehicle Size',\n", + " 'Vehicle Type'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Load the dataset from the URL\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n", + "df = pd.read_csv(url)\n", + "\n", + "# Take a quick look at the columns to understand the data structure\n", + "print(df.columns)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f1acdee1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of customers with total_claim_amount < 1000 and response 'Yes': 1399\n", + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "8 8 FM55990 
California 5989.773931 Yes \n", + "15 15 CW49887 California 4626.801093 Yes \n", + "19 19 NJ54277 California 3746.751625 Yes \n", + "27 27 MQ68407 Oregon 4376.363592 Yes \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "3 Extended College 1/11/11 Employed M ... \n", + "8 Premium College 1/19/11 Employed M ... \n", + "15 Basic Master 1/16/11 Employed F ... \n", + "19 Extended College 2/26/11 Employed F ... \n", + "27 Premium Bachelor 2/28/11 Employed F ... \n", + "\n", + " number_of_open_complaints number_of_policies policy_type \\\n", + "3 0.0 2 Corporate Auto \n", + "8 0.0 1 Personal Auto \n", + "15 0.0 1 Special Auto \n", + "19 1.0 1 Personal Auto \n", + "27 0.0 1 Personal Auto \n", + "\n", + " policy renew_offer_type sales_channel total_claim_amount \\\n", + "3 Corporate L3 Offer2 Branch 484.013411 \n", + "8 Personal L1 Offer2 Branch 739.200000 \n", + "15 Special L1 Offer2 Branch 547.200000 \n", + "19 Personal L2 Offer2 Call Center 19.575683 \n", + "27 Personal L3 Offer2 Agent 60.036683 \n", + "\n", + " vehicle_class vehicle_size vehicle_type \n", + "3 Four-Door Car Medsize A \n", + "8 Sports Car Medsize NaN \n", + "15 SUV Medsize NaN \n", + "19 Two-Door Car Large A \n", + "27 Four-Door Car Medsize NaN \n", + "\n", + "[5 rows x 26 columns]\n" + ] + } + ], + "source": [ + "# Standardizing column names for easier access (lowercase, replace spaces with underscores)\n", + "df.columns = df.columns.str.lower().str.replace(' ', '_')\n", + "\n", + "# Filter the dataset to include only customers with:\n", + "# total_claim_amount below 1000\n", + "# response to the last marketing campaign is \"Yes\"\n", + "filtered_df = df[(df['total_claim_amount'] < 1000) & (df['response'] == 'Yes')]\n", + "\n", + "# Check the shape to see how many customers meet this criteria\n", + "print(f\"Number of customers with total_claim_amount < 1000 and response 'Yes': {filtered_df.shape[0]}\")\n", + "\n", + "# Preview the filtered DataFrame\n", + 
"print(filtered_df.head())" + ] + }, { "cell_type": "markdown", "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", @@ -48,6 +147,65 @@ " - compare these insights to `total_claim_amount` patterns, and discuss which segments appear most profitable or low-risk for the company." ] }, + { + "cell_type": "code", + "execution_count": 14, + "id": "639f9ffd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Monthly Premium and Customer Lifetime Value by Policy Type and Gender:\n", + "\n", + " customer_lifetime_value monthly_premium_auto\n", + "policy_type gender \n", + "Corporate Auto F 7712.63 94.30\n", + " M 7944.47 92.19\n", + "Personal Auto F 8339.79 99.00\n", + " M 7448.38 91.09\n", + "Special Auto F 7691.58 92.31\n", + " M 8247.09 86.34\n", + "\n", + "Average Total Claim Amount by Policy Type and Gender:\n", + "\n", + " total_claim_amount\n", + "policy_type gender \n", + "Corporate Auto F 433.74\n", + " M 408.58\n", + "Personal Auto F 452.97\n", + " M 457.01\n", + "Special Auto F 453.28\n", + " M 429.53\n" + ] + } + ], + "source": [ + "# Filter customers who responded \"Yes\"\n", + "responded_yes = df[df['response'] == 'Yes']\n", + "\n", + "# Pivot table: average monthly premium and customer lifetime value by policy_type and gender\n", + "pivot_avg = responded_yes.pivot_table(\n", + " index=['policy_type', 'gender'],\n", + " values=['monthly_premium_auto', 'customer_lifetime_value'],\n", + " aggfunc='mean'\n", + ").round(2)\n", + "\n", + "print(\"Average Monthly Premium and Customer Lifetime Value by Policy Type and Gender:\\n\")\n", + "print(pivot_avg)\n", + "\n", + "# Pivot table: average total claim amount by policy_type and gender\n", + "pivot_claims = responded_yes.pivot_table(\n", + " index=['policy_type', 'gender'],\n", + " values='total_claim_amount',\n", + " aggfunc='mean'\n", + ").round(2)\n", + "\n", + "print(\"\\nAverage Total Claim Amount by Policy Type and Gender:\\n\")\n", + "print(pivot_claims)\n" + 
] + }, { "cell_type": "markdown", "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", @@ -58,6 +216,38 @@ "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9c0902a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of customers per state (only states with > 500 customers):\n", + "state\n", + "California 3552\n", + "Oregon 2909\n", + "Arizona 1937\n", + "Nevada 993\n", + "Washington 888\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Count customers per state\n", + "customers_per_state = df['state'].value_counts()\n", + "\n", + "# Filter states with more than 500 customers\n", + "states_over_500 = customers_per_state[customers_per_state > 500]\n", + "\n", + "print(\"Number of customers per state (only states with > 500 customers):\")\n", + "print(states_over_500)" + ] + }, { "cell_type": "markdown", "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", @@ -68,6 +258,97 @@ "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." 
] }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f7ebc1eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " max min median\n", + "education gender \n", + "Bachelor F 73225.95652 1904.000852 5640.505303\n", + " M 67907.27050 1898.007675 5548.031892\n", + "College F 61850.18803 1898.683686 5623.611187\n", + " M 61134.68307 1918.119700 6005.847375\n", + "Doctor F 44856.11397 2395.570000 5332.462694\n", + " M 32677.34284 2267.604038 5577.669457\n", + "High School or Below F 55277.44589 2144.921535 6039.553187\n", + " M 83325.38119 1940.981221 6286.731006\n", + "Master F 51016.06704 2417.777032 5729.855012\n", + " M 50568.25912 2272.307310 5579.099207\n" + ] + } + ], + "source": [ + "clv_stats = df.groupby(['education', 'gender'])['customer_lifetime_value'].agg(['max', 'min', 'median'])\n", + "print(clv_stats)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "890e60aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " max min median\n", + "education gender \n", + "Bachelor F 73,225.96 1,904.00 5,640.51\n", + " M 67,907.27 1,898.01 5,548.03\n", + "College F 61,850.19 1,898.68 5,623.61\n", + " M 61,134.68 1,918.12 6,005.85\n", + "Doctor F 44,856.11 2,395.57 5,332.46\n", + " M 32,677.34 2,267.60 5,577.67\n", + "High School or Below F 55,277.45 2,144.92 6,039.55\n", + " M 83,325.38 1,940.98 6,286.73\n", + "Master F 51,016.07 2,417.78 5,729.86\n", + " M 50,568.26 2,272.31 5,579.10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\jdhernandezd\\AppData\\Local\\Temp\\ipykernel_5112\\4277738231.py:5: FutureWarning: DataFrame.applymap has been deprecated. 
Use DataFrame.map instead.\n", + " clv_stats_formatted = clv_stats_rounded.applymap(lambda x: f\"{x:,.2f}\")\n" + ] + } + ], + "source": [ + "# Making numbers more readable\n", + "clv_stats_rounded = clv_stats.round(2)\n", + "\n", + "# Format with thousands separators\n", + "clv_stats_formatted = clv_stats_rounded.applymap(lambda x: f\"{x:,.2f}\")\n", + "\n", + "print(clv_stats_formatted)\n" + ] + }, + { + "cell_type": "markdown", + "id": "26e0122b", + "metadata": {}, + "source": [ + "### Conclusion\n", + "\n", + "1) The highest maximum customer lifetime value (CLV), 83,325.38, is found among males with \"High School or Below\" education, indicating that higher education does not necessarily correlate with higher maximum CLV in this dataset.\n", + "\n", + "2) Median CLV values are relatively consistent across education levels (roughly 5,300 to 6,300), with the \"High School or Below\" group showing the highest medians for both genders (6,039.55 for females and 6,286.73 for males), suggesting solid typical customer value in this segment.\n", + "\n", + "3) Customers in the \"Doctor\" education group have the lowest maximum CLV (44,856.11 for females and 32,677.34 for males), which might be influenced by a smaller sample size or different customer behaviors.\n", + "\n", + "4) Gender differences are present but not pronounced; males and females exhibit similar median CLVs within each education level, while minimum CLV is nearly the same for every group (roughly 1,900 to 2,400).\n", + "\n", + "In general, education level by itself does not strongly determine customer lifetime value in this dataset. Notably, some groups with lower formal education exhibit high and steady CLVs, emphasizing the need to evaluate additional variables when analyzing customer value." + ] + }, { "cell_type": "markdown", "id": "b42999f9-311f-481e-ae63-40a5577072c5", @@ -143,7 +424,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -157,7 +438,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.7" } }, "nbformat": 4,