diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fadd718..23f1464 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -134,7 +134,1085 @@ }, "outputs": [], "source": [ - "# your code goes here" + "import pandas as pd\n", + "\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n", + "df = pd.read_csv(url)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cf984ed4", + "metadata": {}, + "outputs": [], + "source": [ + "# EXERCISE 1\n", + "# Drop null values in Total Claim Amount first: astype(int) raises on NaN (this dataset has none)\n", + "df = df.dropna(subset=['Total Claim Amount'])\n", + "\n", + "# Convert Total Claim Amount to integer (truncates the decimals)\n", + "df = df.astype({'Total Claim Amount': int})\n", + "\n", + "# Filter df for Total Claim Amount < 1000 & Response = 'Yes'\n", + "filtered_df = df[(df['Total Claim Amount'] < 1000) & (df['Response'] == 'Yes')]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f0c2c82", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Policy Type", + "rawType": "object", + "type": "string" + }, + { + "name": "Gender", + "rawType": "object", + "type": "string" + }, + { + "name": "Monthly Premium Auto", + "rawType": "float64", + "type": "float" + }, + { + "name": "Customer Lifetime Value", + "rawType": "float64", + "type": "float" + }, + { + "name": "Total Claim Amount", + "rawType": "float64", + "type": "float" + } + ], + "ref": "ef8616ac-f16f-4872-bfde-74f8940f2b01", + "rows": [ + [ + "0", + "Corporate Auto", + "F", + "94.30177514792899", + "7712.62873610651", + "433.29585798816566" + ], + [ + "1", + "Corporate Auto", + "M", + "92.18831168831169", + "7944.465413844156", + "408.1233766233766" + ], + [ + "2", + "Personal Auto", + "F", + 
"98.99814814814815", + "8339.791842237037", + "452.4981481481482" + ], + [ + "3", + "Personal Auto", + "M", + "91.08582089552239", + "7448.383280707089", + "456.5764925373134" + ], + [ + "4", + "Special Auto", + "F", + "92.31428571428572", + "7691.584111285713", + "452.85714285714283" + ], + [ + "5", + "Special Auto", + "M", + "86.34375", + "8247.08870234375", + "429.125" + ] + ], + "shape": { + "columns": 5, + "rows": 6 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Policy TypeGenderMonthly Premium AutoCustomer Lifetime ValueTotal Claim Amount
0Corporate AutoF94.3017757712.628736433.295858
1Corporate AutoM92.1883127944.465414408.123377
2Personal AutoF98.9981488339.791842452.498148
3Personal AutoM91.0858217448.383281456.576493
4Special AutoF92.3142867691.584111452.857143
5Special AutoM86.3437508247.088702429.125000
\n", + "
" + ], + "text/plain": [ + " Policy Type Gender Monthly Premium Auto Customer Lifetime Value \\\n", + "0 Corporate Auto F 94.301775 7712.628736 \n", + "1 Corporate Auto M 92.188312 7944.465414 \n", + "2 Personal Auto F 98.998148 8339.791842 \n", + "3 Personal Auto M 91.085821 7448.383281 \n", + "4 Special Auto F 92.314286 7691.584111 \n", + "5 Special Auto M 86.343750 8247.088702 \n", + "\n", + " Total Claim Amount \n", + "0 433.295858 \n", + "1 408.123377 \n", + "2 452.498148 \n", + "3 456.576493 \n", + "4 452.857143 \n", + "5 429.125000 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Exercise 2\n", + "# For response 'Yes' and group by Policy Type and Gender the average monthly premium and customer lifetime value\n", + "grouped_df = (\n", + "\tdf[df['Response'] == 'Yes']\n", + "\t.groupby(['Policy Type', 'Gender'])[['Monthly Premium Auto', 'Customer Lifetime Value','Total Claim Amount']]\n", + "\t.mean()\n", + "\t.reset_index()\n", + ")\n", + "grouped_df\n", + "\n", + "# Insights for Total Claim Amount:\n", + "# 1. Among customers who responded 'Yes', those with Personal Auto policies have a higher average Total Claim Amount (about 452-457) than those with Corporate Auto policies (about 408-433), for both genders.\n", + "# 2. 
Female customers have a higher average Total Claim Amount than male customers for Corporate Auto and Special Auto policies but a slightly lower one for Personal Auto, so there is no consistent gender pattern.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "184a9748", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "State", + "rawType": "object", + "type": "string" + }, + { + "name": "count", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "fce5c3e9-0ee0-4c5d-ad57-a65e12574740", + "rows": [ + [ + "California", + "3552" + ], + [ + "Oregon", + "2909" + ], + [ + "Arizona", + "1937" + ], + [ + "Nevada", + "993" + ], + [ + "Washington", + "888" + ] + ], + "shape": { + "columns": 1, + "rows": 5 + } + }, + "text/plain": [ + "State\n", + "California 3552\n", + "Oregon 2909\n", + "Arizona 1937\n", + "Nevada 993\n", + "Washington 888\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# EXERCISE 3. \n", + "\n", + "# Group by states with more than 500 customers\n", + "\n", + "state_customer_counts = df['State'].value_counts()\n", + "filtered_states = state_customer_counts[state_customer_counts > 500]\n", + "filtered_states\n", + "\n", + "# Insights:\n", + "# 1. The states with more than 500 customers are California, Oregon, Arizona, Nevada, and Washington.\n", + "# 2. California (CA) has the highest number of customers, significantly more than any other state.\n", + "# 3. The distribution of customers across these states indicates a strong presence in the western region of the United States." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f9ee962", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Education", + "rawType": "object", + "type": "string" + }, + { + "name": "Gender", + "rawType": "object", + "type": "string" + }, + { + "name": "max", + "rawType": "float64", + "type": "float" + }, + { + "name": "min", + "rawType": "float64", + "type": "float" + }, + { + "name": "median", + "rawType": "float64", + "type": "float" + } + ], + "ref": "650d0b8d-0c8e-4657-9cee-c11b01a93d9c", + "rows": [ + [ + "0", + "Bachelor", + "F", + "73225.95652", + "1904.000852", + "5640.505303" + ], + [ + "1", + "Bachelor", + "M", + "67907.2705", + "1898.007675", + "5548.031892" + ], + [ + "2", + "College", + "F", + "61850.18803", + "1898.683686", + "5623.611187" + ], + [ + "3", + "College", + "M", + "61134.68307", + "1918.1197", + "6005.847375" + ], + [ + "4", + "Doctor", + "F", + "44856.11397", + "2395.57", + "5332.462694" + ], + [ + "5", + "Doctor", + "M", + "32677.34284", + "2267.604038", + "5577.669457" + ], + [ + "6", + "High School or Below", + "F", + "55277.44589", + "2144.921535", + "6039.5531869999995" + ], + [ + "7", + "High School or Below", + "M", + "83325.38119", + "1940.981221", + "6286.731006" + ], + [ + "8", + "Master", + "F", + "51016.06704", + "2417.777032", + "5729.855012" + ], + [ + "9", + "Master", + "M", + "50568.25912", + "2272.30731", + "5579.0992074999995" + ] + ], + "shape": { + "columns": 5, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EducationGendermaxminmedian
0BachelorF73225.956521904.0008525640.505303
1BachelorM67907.270501898.0076755548.031892
2CollegeF61850.188031898.6836865623.611187
3CollegeM61134.683071918.1197006005.847375
4DoctorF44856.113972395.5700005332.462694
5DoctorM32677.342842267.6040385577.669457
6High School or BelowF55277.445892144.9215356039.553187
7High School or BelowM83325.381191940.9812216286.731006
8MasterF51016.067042417.7770325729.855012
9MasterM50568.259122272.3073105579.099207
\n", + "
" + ], + "text/plain": [ + " Education Gender max min median\n", + "0 Bachelor F 73225.95652 1904.000852 5640.505303\n", + "1 Bachelor M 67907.27050 1898.007675 5548.031892\n", + "2 College F 61850.18803 1898.683686 5623.611187\n", + "3 College M 61134.68307 1918.119700 6005.847375\n", + "4 Doctor F 44856.11397 2395.570000 5332.462694\n", + "5 Doctor M 32677.34284 2267.604038 5577.669457\n", + "6 High School or Below F 55277.44589 2144.921535 6039.553187\n", + "7 High School or Below M 83325.38119 1940.981221 6286.731006\n", + "8 Master F 51016.06704 2417.777032 5729.855012\n", + "9 Master M 50568.25912 2272.307310 5579.099207" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Exercise 4\n", + "# Find the maximum, minimum, and median customer lifetime value by education level and gender\n", + "lifetime_value_stats = (\n", + " df.groupby(['Education', 'Gender'])['Customer Lifetime Value']\n", + " .agg(['max', 'min', 'median'])\n", + " .reset_index()\n", + ")\n", + "lifetime_value_stats\n", + "\n", + "#Conclusions:\n", + "# 1. The highest maximum Customer Lifetime Value belongs to male customers with High School or Below education (about 83,325), followed by female Bachelor customers (about 73,226); customers with a Doctor degree have the lowest maximums for both genders.\n", + "# 2. The minimum Customer Lifetime Value is relatively consistent across different education levels, indicating a baseline value.\n", + "# 3. Median Customer Lifetime Value does not increase with education level: it stays in a narrow band (about 5,300-6,300) and is highest for High School or Below customers of both genders.\n", + "# 4. 
Gender differences are mixed across education levels — some groups show higher median CLV for females (e.g., Bachelor, Master) while others show higher median CLV for males (e.g., College, Doctor, High School); there is no consistent gender advantage overall.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f70ca8a7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "State", + "rawType": "object", + "type": "string" + }, + { + "name": "January", + "rawType": "int64", + "type": "integer" + }, + { + "name": "February", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "35455a08-7cb5-403e-9e6e-e51c4c34e77c", + "rows": [ + [ + "Arizona", + "3052", + "2864" + ], + [ + "California", + "5673", + "4929" + ], + [ + "Nevada", + "1493", + "1278" + ], + [ + "Oregon", + "4697", + "3969" + ], + [ + "Washington", + "1358", + "1225" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MonthJanuaryFebruary
State
Arizona30522864
California56734929
Nevada14931278
Oregon46973969
Washington13581225
\n", + "
" + ], + "text/plain": [ + "Month January February\n", + "State \n", + "Arizona 3052 2864\n", + "California 5673 4929\n", + "Nevada 1493 1278\n", + "Oregon 4697 3969\n", + "Washington 1358 1225" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Exercise 5 \n", + "# The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows.\n", + "\n", + "# First, ensure that the 'Effective To Date' column is in datetime format\n", + "df['Effective To Date'] = pd.to_datetime(df['Effective To Date'])\n", + "# Extract month from 'Effective To Date'\n", + "df['Month'] = df['Effective To Date'].dt.month_name()\n", + "# Create a pivot table with states as rows and months as columns\n", + "# The dataset column is named 'Number of Policies' — sum it to get total policies sold per state/month\n", + "policy_pivot = pd.pivot_table(\n", + " df,\n", + " index='State',\n", + " columns='Month',\n", + " values='Number of Policies',\n", + " aggfunc='sum',\n", + " fill_value=0\n", + ")\n", + "policy_pivot = policy_pivot.reindex(columns=[\n", + " 'January', 'February', \n", + "])\n", + "policy_pivot " + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "aafe9740", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "State", + "rawType": "object", + "type": "string" + }, + { + "name": "Month", + "rawType": "object", + "type": "string" + }, + { + "name": "Number of Policies", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "d6a2c1b9-5d40-4ab0-a10f-71a1bdbc6c4d", + "rows": [ + [ + "3", + "California", + "January", + "5673" + ], + [ + "2", + "California", + "February", + "4929" + ], + [ + "5", + "Oregon", + "January", + "4697" + ], 
+ [ + "4", + "Oregon", + "February", + "3969" + ], + [ + "1", + "Arizona", + "January", + "3052" + ], + [ + "0", + "Arizona", + "February", + "2864" + ] + ], + "shape": { + "columns": 3, + "rows": 6 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StateMonthNumber of Policies
3CaliforniaJanuary5673
2CaliforniaFebruary4929
5OregonJanuary4697
4OregonFebruary3969
1ArizonaJanuary3052
0ArizonaFebruary2864
\n", + "
" + ], + "text/plain": [ + " State Month Number of Policies\n", + "3 California January 5673\n", + "2 California February 4929\n", + "5 Oregon January 4697\n", + "4 Oregon February 3969\n", + "1 Arizona January 3052\n", + "0 Arizona February 2864" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#EXERCISE 6\n", + "# Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold. Without using a pivot table and sorting from higher to lower.\n", + "\n", + "# Get the top 3 states ranked by total 'Number of Policies' (the same measure reported below), not by row count\n", + "top_states = df.groupby('State')['Number of Policies'].sum().nlargest(3).index.tolist()\n", + "# Filter the DataFrame for these top states\n", + "top_states_df = df[df['State'].isin(top_states)]\n", + "# Group by State and Month, then sum the number of policies sold\n", + "policies_by_state_month = (\n", + " top_states_df.groupby(['State', 'Month'])['Number of Policies']\n", + " .sum()\n", + " .reset_index()\n", + ")\n", + "policies_by_state_month = policies_by_state_month.sort_values(by=['Number of Policies'], ascending=[False])\n", + "policies_by_state_month\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "2817a6f0", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Marketing Channel", + "rawType": "object", + "type": "string" + }, + { + "name": "Response Rate", + "rawType": "float64", + "type": "float" + } + ], + "ref": "486d7b1e-bafd-4ca0-b645-9698eeb1de75", + "rows": [ + [ + "0", + "Agent", + "0.1800533851007037" + ], + [ + "1", + "Branch", + "0.10787557908669755" + ], + [ + "2", + "Call Center", + "0.10322279308734236" + ], + [ + "3", + "Web", + "0.1088560885608856" + ] + ], + "shape": { + "columns": 2, + "rows": 4 + } + }, + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Marketing ChannelResponse Rate
0Agent0.180053
1Branch0.107876
2Call Center0.103223
3Web0.108856
\n", + "
" + ], + "text/plain": [ + " Marketing Channel Response Rate\n", + "0 Agent 0.180053\n", + "1 Branch 0.107876\n", + "2 Call Center 0.103223\n", + "3 Web 0.108856" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#EXERCISE 7\n", + "# The marketing team wants to analyze the effect of different marketing channels on the customer response rate.\n", + "\n", + "# Some datasets contain binary columns like 'Email', 'Phone', 'SMS', 'Mail' to indicate contact;\n", + "# this dataset does not. We'll first check for those columns and fallback to using 'Sales Channel'\n", + "# (which exists in this DataFrame) if the contact columns are not present.\n", + "\n", + "contact_cols = [c for c in ['Email', 'Phone', 'SMS', 'Mail'] if c in df.columns]\n", + "\n", + "if contact_cols:\n", + " # Melt the DataFrame to unpivot marketing channels if those columns exist\n", + " melted_df = pd.melt(\n", + " df,\n", + " id_vars=['Customer', 'Response'],\n", + " value_vars=contact_cols,\n", + " var_name='Marketing Channel',\n", + " value_name='Contacted'\n", + " )\n", + " # Filter for contacted customers\n", + " contacted_df = melted_df[melted_df['Contacted'] == 1]\n", + " # Calculate response rate by marketing channel\n", + " response_rate = (\n", + " contacted_df.groupby('Marketing Channel')['Response']\n", + " .apply(lambda x: (x == 'Yes').mean())\n", + " .reset_index(name='Response Rate')\n", + " )\n", + "else:\n", + " # Fallback: use 'Sales Channel' column to compute response rate per channel\n", + " response_rate = (\n", + " df.groupby('Sales Channel')['Response']\n", + " .apply(lambda x: (x == 'Yes').mean())\n", + " .reset_index(name='Response Rate')\n", + " .rename(columns={'Sales Channel': 'Marketing Channel'})\n", + " )\n", + "\n", + "response_rate" ] } ], @@ -143,7 +1221,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ 
-157,7 +1235,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,