diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fadd718..d9b14bc 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -36,6 +36,254 @@ " - have a response \"Yes\" to the last marketing campaign." ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7766cfde", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Marketing customer analysis dataset, read straight from its raw GitHub URL.\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n", + "df = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "18ddaef0", + "metadata": {}, + "outputs": [], + "source": [ + "# Snake-case the column labels: lower-case them and turn spaces into underscores.\n", + "df = df.rename(columns=lambda label: label.lower().replace(\" \", \"_\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1155c44c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_open_complaintsnumber_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_type
33XL78013Oregon22332.439460YesExtendedCollege1/11/11EmployedM...0.02Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA
88FM55990California5989.773931YesPremiumCollege1/19/11EmployedM...0.01Personal AutoPersonal L1Offer2Branch739.200000Sports CarMedsizeNaN
1515CW49887California4626.801093YesBasicMaster1/16/11EmployedF...0.01Special AutoSpecial L1Offer2Branch547.200000SUVMedsizeNaN
1919NJ54277California3746.751625YesExtendedCollege2/26/11EmployedF...1.01Personal AutoPersonal L2Offer2Call Center19.575683Two-Door CarLargeA
2727MQ68407Oregon4376.363592YesPremiumBachelor2/28/11EmployedF...0.01Personal AutoPersonal L3Offer2Agent60.036683Four-Door CarMedsizeNaN
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "8 8 FM55990 California 5989.773931 Yes \n", + "15 15 CW49887 California 4626.801093 Yes \n", + "19 19 NJ54277 California 3746.751625 Yes \n", + "27 27 MQ68407 Oregon 4376.363592 Yes \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "3 Extended College 1/11/11 Employed M ... \n", + "8 Premium College 1/19/11 Employed M ... \n", + "15 Basic Master 1/16/11 Employed F ... \n", + "19 Extended College 2/26/11 Employed F ... \n", + "27 Premium Bachelor 2/28/11 Employed F ... \n", + "\n", + " number_of_open_complaints number_of_policies policy_type \\\n", + "3 0.0 2 Corporate Auto \n", + "8 0.0 1 Personal Auto \n", + "15 0.0 1 Special Auto \n", + "19 1.0 1 Personal Auto \n", + "27 0.0 1 Personal Auto \n", + "\n", + " policy renew_offer_type sales_channel total_claim_amount \\\n", + "3 Corporate L3 Offer2 Branch 484.013411 \n", + "8 Personal L1 Offer2 Branch 739.200000 \n", + "15 Special L1 Offer2 Branch 547.200000 \n", + "19 Personal L2 Offer2 Call Center 19.575683 \n", + "27 Personal L3 Offer2 Agent 60.036683 \n", + "\n", + " vehicle_class vehicle_size vehicle_type \n", + "3 Four-Door Car Medsize A \n", + "8 Sports Car Medsize NaN \n", + "15 SUV Medsize NaN \n", + "19 Two-Door Car Large A \n", + "27 Four-Door Car Medsize NaN \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Task 1: customers with a total claim below 1000 who answered \"Yes\" to the last campaign.\n", + "is_low_claim = df['total_claim_amount'] < 1000\n", + "answered_yes = df['response'] == \"Yes\"\n", + "low_claim_yes = df.loc[is_low_claim & answered_yes]\n", + "low_claim_yes.head()" + ] + }, { "cell_type": "markdown", "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", @@ -48,6 +296,112 @@ " - compare these insights to `total_claim_amount` patterns, and discuss which segments appear most profitable or low-risk for the company." 
] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bea60123", + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only the customers who responded \"Yes\"; the aggregations below start from this subset.\n", + "yes_responders = df[df['response'] == \"Yes\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e9102999", + "metadata": {}, + "outputs": [], + "source": [ + "# Mean monthly premium and customer lifetime value per (policy type, gender) among \"Yes\" responders.\n", + "grouped = yes_responders.groupby(['policy_type', 'gender'])[['monthly_premium_auto', 'customer_lifetime_value']].mean().reset_index()\n", + "# End the cell on the table itself so the result of the analysis is actually displayed.\n", + "grouped" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d2b66241", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typegendertotal_claim_amount
0Corporate AutoF433.738499
1Corporate AutoM408.582459
2Personal AutoF452.965929
3Personal AutoM457.010178
4Special AutoF453.280164
\n", + "
" + ], + "text/plain": [ + " policy_type gender total_claim_amount\n", + "0 Corporate Auto F 433.738499\n", + "1 Corporate Auto M 408.582459\n", + "2 Personal Auto F 452.965929\n", + "3 Personal Auto M 457.010178\n", + "4 Special Auto F 453.280164" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Mean total claim amount per (policy type, gender) among \"Yes\" responders.\n", + "claim_pattern = yes_responders.groupby(['policy_type', 'gender'])['total_claim_amount'].mean().reset_index()\n", + "# Show the whole summary: .head() stops after five rows and would hide any further group.\n", + "claim_pattern" + ] + }, { "cell_type": "markdown", "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", @@ -58,6 +412,17 @@ "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "616bcc9a", + "metadata": {}, + "outputs": [], + "source": [ + "# Number of rows (customers) per state, then only the states with more than 500 of them.\n", + "state_counts = df['state'].value_counts()\n", + "states_over_500 = state_counts[state_counts > 500]\n", + "# End the cell on the filtered counts so the answer to the task is displayed.\n", + "states_over_500" + ] + }, { "cell_type": "markdown", "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", @@ -68,6 +433,113 @@ "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." ] }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4bcf8d06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationgendermaxminmedian
0BachelorF73225.956521904.0008525640.505303
1BachelorM67907.270501898.0076755548.031892
2CollegeF61850.188031898.6836865623.611187
3CollegeM61134.683071918.1197006005.847375
4DoctorF44856.113972395.5700005332.462694
\n", + "
" + ], + "text/plain": [ + " education gender max min median\n", + "0 Bachelor F 73225.95652 1904.000852 5640.505303\n", + "1 Bachelor M 67907.27050 1898.007675 5548.031892\n", + "2 College F 61850.18803 1898.683686 5623.611187\n", + "3 College M 61134.68307 1918.119700 6005.847375\n", + "4 Doctor F 44856.11397 2395.570000 5332.462694" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Max, min and median customer lifetime value per (education, gender) group.\n", + "clv_stats = df.groupby(['education', 'gender'])['customer_lifetime_value'].agg(['max', 'min', 'median']).reset_index()\n", + "# Show every group: .head() stops after five rows, so conclusions would rest on a partial table.\n", + "clv_stats" + ] + }, + { + "cell_type": "markdown", + "id": "0d6e712c", + "metadata": {}, + "source": [ + "- Doctorate-level customers show a lower maximum CLV.\n", + "- Customers with a Bachelor's degree tend to have the highest maximum CLV." + ] + }, { "cell_type": "markdown", "id": "b42999f9-311f-481e-ae63-40a5577072c5", @@ -88,6 +560,112 @@ "5. The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows." ] }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2eb4fabe", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\levgi\\AppData\\Local\\Temp\\ipykernel_11876\\3086097951.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df['effective_to_date'] = pd.to_datetime(df['effective_to_date'])\n" + ] + } + ], + "source": [ + "# Dates are month/day/two-digit-year strings (e.g. 2/28/11); an explicit format avoids the\n", + "# per-element dateutil fallback and the UserWarning it raises.\n", + "df['effective_to_date'] = pd.to_datetime(df['effective_to_date'], format='%m/%d/%y')\n", + "# Full month name (e.g. January), used as the column key of the pivots further down.\n", + "df['month'] = df['effective_to_date'].dt.strftime('%B')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ab84c1ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monthFebruaryJanuary
state
Arizona9291008
California16341918
Nevada442551
Oregon13441565
Washington425463
\n", + "
" + ], + "text/plain": [ + "month February January\n", + "state \n", + "Arizona 929 1008\n", + "California 1634 1918\n", + "Nevada 442 551\n", + "Oregon 1344 1565\n", + "Washington 425 463" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Count of policies per state (rows) and month (columns); missing combinations become 0.\n", + "policies_by_state_month = df.pivot_table(index='state', columns='month', values='policy', aggfunc='count', fill_value=0)\n", + "# The month columns come back alphabetically (February before January); put them in calendar order.\n", + "month_order = sorted(policies_by_state_month.columns, key=lambda name: pd.to_datetime(name, format='%B').month)\n", + "policies_by_state_month = policies_by_state_month[month_order]\n", + "policies_by_state_month.head()" + ] + }, { "cell_type": "markdown", "id": "b6aec097-c633-4017-a125-e77a97259cda", @@ -103,6 +681,93 @@ "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" ] }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e43cdb63", + "metadata": {}, + "outputs": [], + "source": [ + "# nlargest(3) already returns the three most frequent states in descending order of count.\n", + "# Sorting the resulting index again would order the state names alphabetically, not by count.\n", + "top_states = df['state'].value_counts().nlargest(3).index\n", + "top_states_df = df[df['state'].isin(top_states)]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9713939d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monthFebruaryJanuary
state
Arizona9291008
California16341918
Oregon13441565
\n", + "
" + ], + "text/plain": [ + "month February January\n", + "state \n", + "Arizona 929 1008\n", + "California 1634 1918\n", + "Oregon 1344 1565" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Policies per (state, month) for the top-3 states, reshaped to states as rows and months as columns.\n", + "grouped_top = top_states_df.groupby(['state', 'month'])['policy'].count().reset_index()\n", + "top3_pivot = grouped_top.pivot(index='state', columns='month', values='policy').fillna(0)\n", + "# The month columns come back alphabetically (February before January); put them in calendar order.\n", + "month_order = sorted(top3_pivot.columns, key=lambda name: pd.to_datetime(name, format='%B').month)\n", + "top3_pivot = top3_pivot[month_order]\n", + "top3_pivot.head()" + ] + }, { "cell_type": "markdown", "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", @@ -127,14 +792,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "449513f4-0459-46a0-a18d-9398d974c9ad", "metadata": { "id": "449513f4-0459-46a0-a18d-9398d974c9ad" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "sales_channel\n", + "Agent 0.190746\n", + "Branch 0.113787\n", + "Call Center 0.109786\n", + "Web 0.117141\n", + "Name: Yes, dtype: float64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code goes here" + "# Share of \"Yes\" responses within each sales channel (0 where a channel has no \"Yes\").\n", + "response_rate = df.groupby('sales_channel')['response'].value_counts(normalize=True).unstack().fillna(0)['Yes']\n", + "response_rate" ] } ], @@ -143,7 +825,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -157,7 +839,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.7" } }, "nbformat": 4,