From cbcca474d316a689c6ea56dc740023c1318de03d Mon Sep 17 00:00:00 2001 From: Rui Braz Date: Sat, 20 Sep 2025 15:55:44 +0100 Subject: [PATCH] Solved lab --- .DS_Store | Bin 0 -> 6148 bytes .../cleaning_functions.cpython-313.pyc | Bin 0 -> 3717 bytes lab-dw-aggregating.ipynb | 1510 +++++++++++++++-- 3 files changed, 1372 insertions(+), 138 deletions(-) create mode 100644 .DS_Store create mode 100644 __pycache__/cleaning_functions.cpython-313.pyc diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T07&-72aJgN&Q&QQ+Fd#q_(QaG>%YPv0X>DWXl%iRBOd66ais9PjqN6x^5js3q$(xGpuA0Qqsj0wRr3}xq zQ!oiNTw>BR6BQ=rm_$7mrk?JZw^WHm12dFGEvIDbhN;S=3tVGiZj42V$-7yEn79=-ys)1`G5CYz~UH(vgp#FBW5gYbn!<)v|nSaDGf1dDd3{HY{A%Xj%zba; z_I-EDtIqiHn>)RI)tUc13k$KUB1NmeQQz*D_XSS`dU^0vAfzA=;gaDe!Y`B#IQYi4AdWYCaaIa@1TD;Gx5Asx3&epT715%tkV0Lwv4|e=Hjh`;(FWUVXkA+O zIjN-u&mO*VZ-)YWYQ1~>>}mQL%noUhuchj38) zDpxOKkWj^P42TbldZyzk@9B0KApsG;*@4&f9u1U(heIRDdiv zwp?&*IXt!WdD}H~n_h&MVB$$oqeyyHp;lh2-F?4ae!rGC8hOjlTeY7R>v^Z8y2*K!MQmMdtJU0jD1n4*5+%4O7bQaMmUK5z%7t8=-?{!35crjG$IU~+Kg`*pJ|WJ_{VNmukXZ< z-(Pufz7Zet<3pR*>+wu=vbuI}7G@5oYN<SQgd z`0{H{63I`pA7vj6*2d@jH*VJM{M1kAtwFGnQfulJKRMM%&icvO$G7Xr1somp<)NMA z$lkHPY4zlf8p+#!^7hw)@Z;dTQqxjv`jh0yC&iD7k0xtBnD^g=$+ub~Um$IfVc6il zP{L^7RJ8_o&huQ8exSeIIDv(NEYqYe)ahsd*?`v2W=Ct~z<|p3fVD+&5mY)TdxF(q zEZSibK3i5>_Am=-(U({X#-MKKYzJEEdWofA>>K)YAS5NAS`j@4qUxz!ze3y+Rm@_^ zTUUs?+N384a;gxZ77H_-GN;eA*lX4DRul+&P$L$kW~FHAw9K=12$DT!>d=b2N^r9T zAxV&f$p8ozK{_m&tiR|w-ip1RE5Q<0xtP<-o~w#nM-8(`0gO93!E%bpMYmvr1UhAr zqFJzX=TniRZ7*BIR)8LC6JToF0pn3}4%AxDrhGB|lpeW6Z5rb5Vj^u!@ zn`@FwFkuX5o(BOHSp0avrCZl(@u6CLyb-_P$1hZ8|9Rxp2cex<|AW+r=?CeLHSnh;PsPT5kZ(hJ3=8 zPi)HrSboplo7<2#3)MMa9{p18uJydJEuVP~M>WbKB{&wHk`Y+vpgnOHog|1((@4?jC}1jHa0Ve#\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 26 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 10910 non-null int64 \n", + " 1 Customer 10910 non-null object \n", + " 2 State 10279 non-null object \n", + " 3 Customer Lifetime Value 10910 non-null float64\n", + " 4 Response 10279 non-null object \n", + " 5 Coverage 10910 non-null object \n", + " 6 Education 10910 non-null object \n", + " 7 Effective To Date 10910 non-null object \n", + " 8 EmploymentStatus 10910 non-null object \n", + " 9 Gender 10910 non-null object \n", + " 10 Income 10910 non-null int64 \n", + " 11 Location Code 10910 non-null object \n", + " 12 Marital Status 10910 non-null object \n", + " 13 Monthly Premium Auto 10910 non-null int64 \n", + " 14 Months Since Last Claim 10277 non-null float64\n", + " 15 Months Since Policy Inception 10910 non-null int64 \n", + " 16 Number of Open Complaints 10277 non-null float64\n", + " 17 Number of Policies 10910 non-null int64 \n", + " 18 Policy Type 10910 non-null object \n", + " 19 Policy 10910 non-null object \n", + " 20 Renew Offer Type 10910 non-null object \n", + " 21 Sales Channel 10910 non-null object \n", + " 22 Total Claim Amount 10910 non-null float64\n", + " 23 Vehicle Class 10288 non-null object \n", + " 24 Vehicle Size 10288 non-null object \n", + " 25 Vehicle Type 5428 non-null object \n", + "dtypes: float64(4), int64(5), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\"\n", + "df = pd.read_csv(url)\n", + "\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "01e8fcc0-0ec9-4e67-be70-27af6398c807", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop(columns=['Unnamed: 0'])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cb1be6a7-0f3c-46cd-beab-9ca3ce3cab77", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d", - "metadata": { - "id": "a8f08a52-bec0-439b-99cc-11d3809d8b5d" - }, - "source": [ - "In this challenge, we will continue to work with customer data from an insurance company. We will use the dataset called marketing_customer_analysis.csv, which can be found at the following link:\n", - "\n", - "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\n", - "\n", - "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by first performing data cleaning, formatting, and structuring." + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgenderincome...number_of_open_complaintsnumber_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_type
0DK49336Arizona4809.216960NoBasicCollege2/18/11EmployedM48029...0.09Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeNaN
1KX64629California2228.525238NoBasicCollege1/18/11UnemployedF0...0.01Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeNaN
2LZ68649Washington14947.917300NoBasicBachelor2/10/11EmployedM22139...0.02Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA
3XL78013Oregon22332.439460YesExtendedCollege1/11/11EmployedM49078...0.02Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA
4QA50777Oregon9025.067525NoPremiumBachelor1/17/11Medical LeaveF23675...NaN7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeNaN
..................................................................
10905FE99816Nevada15563.369440NoPremiumBachelor1/19/11UnemployedF0...NaN7Personal AutoPersonal L1Offer3Web1214.400000Luxury CarMedsizeA
10906KX53892Oregon5259.444853NoBasicCollege1/6/11EmployedF61146...0.06Personal AutoPersonal L3Offer2Branch273.018929Four-Door CarMedsizeA
10907TL39050Arizona23893.304100NoExtendedBachelor2/6/11EmployedF39837...0.02Corporate AutoCorporate L3Offer1Web381.306996Luxury SUVMedsizeNaN
10908WA60547California11971.977650NoPremiumCollege2/13/11EmployedF64195...4.06Personal AutoPersonal L1Offer1Branch618.288849SUVMedsizeA
10909IV32877NaN6857.519928NaNBasicBachelor1/8/11UnemployedM0...0.03Personal AutoPersonal L1Offer4Web1021.719397SUVMedsizeNaN
\n", + "

10910 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " customer state customer_lifetime_value response coverage \\\n", + "0 DK49336 Arizona 4809.216960 No Basic \n", + "1 KX64629 California 2228.525238 No Basic \n", + "2 LZ68649 Washington 14947.917300 No Basic \n", + "3 XL78013 Oregon 22332.439460 Yes Extended \n", + "4 QA50777 Oregon 9025.067525 No Premium \n", + "... ... ... ... ... ... \n", + "10905 FE99816 Nevada 15563.369440 No Premium \n", + "10906 KX53892 Oregon 5259.444853 No Basic \n", + "10907 TL39050 Arizona 23893.304100 No Extended \n", + "10908 WA60547 California 11971.977650 No Premium \n", + "10909 IV32877 NaN 6857.519928 NaN Basic \n", + "\n", + " education effective_to_date employmentstatus gender income ... \\\n", + "0 College 2/18/11 Employed M 48029 ... \n", + "1 College 1/18/11 Unemployed F 0 ... \n", + "2 Bachelor 2/10/11 Employed M 22139 ... \n", + "3 College 1/11/11 Employed M 49078 ... \n", + "4 Bachelor 1/17/11 Medical Leave F 23675 ... \n", + "... ... ... ... ... ... ... \n", + "10905 Bachelor 1/19/11 Unemployed F 0 ... \n", + "10906 College 1/6/11 Employed F 61146 ... \n", + "10907 Bachelor 2/6/11 Employed F 39837 ... \n", + "10908 College 2/13/11 Employed F 64195 ... \n", + "10909 Bachelor 1/8/11 Unemployed M 0 ... \n", + "\n", + " number_of_open_complaints number_of_policies policy_type \\\n", + "0 0.0 9 Corporate Auto \n", + "1 0.0 1 Personal Auto \n", + "2 0.0 2 Personal Auto \n", + "3 0.0 2 Corporate Auto \n", + "4 NaN 7 Personal Auto \n", + "... ... ... ... \n", + "10905 NaN 7 Personal Auto \n", + "10906 0.0 6 Personal Auto \n", + "10907 0.0 2 Corporate Auto \n", + "10908 4.0 6 Personal Auto \n", + "10909 0.0 3 Personal Auto \n", + "\n", + " policy renew_offer_type sales_channel total_claim_amount \\\n", + "0 Corporate L3 Offer3 Agent 292.800000 \n", + "1 Personal L3 Offer4 Call Center 744.924331 \n", + "2 Personal L3 Offer3 Call Center 480.000000 \n", + "3 Corporate L3 Offer2 Branch 484.013411 \n", + "4 Personal L2 Offer1 Branch 707.925645 \n", + "... ... ... ... ... \n", + "10905 Personal L1 Offer3 Web 1214.400000 \n", + "10906 Personal L3 Offer2 Branch 273.018929 \n", + "10907 Corporate L3 Offer1 Web 381.306996 \n", + "10908 Personal L1 Offer1 Branch 618.288849 \n", + "10909 Personal L1 Offer4 Web 1021.719397 \n", + "\n", + " vehicle_class vehicle_size vehicle_type \n", + "0 Four-Door Car Medsize NaN \n", + "1 Four-Door Car Medsize NaN \n", + "2 SUV Medsize A \n", + "3 Four-Door Car Medsize A \n", + "4 Four-Door Car Medsize NaN \n", + "... ... ... ... \n", + "10905 Luxury Car Medsize A \n", + "10906 Four-Door Car Medsize A \n", + "10907 Luxury SUV Medsize NaN \n", + "10908 SUV Medsize A \n", + "10909 SUV Medsize NaN \n", + "\n", + "[10910 rows x 25 columns]" ] - }, + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns = [col.lower().replace(' ', '_') for col in df.columns]\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "11814331-8748-41ba-a054-464d92080595", + "metadata": {}, + "outputs": [], + "source": [ + "df['effective_to_date'] = pd.to_datetime(df['effective_to_date'], format='%m/%d/%y')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "51018226-6dda-49f1-b4f3-491bdca531dd", + "metadata": {}, + "outputs": [], + "source": [ + "median_value = df['months_since_last_claim'].median()\n", + "\n", + "df['months_since_last_claim'] = df['months_since_last_claim'].fillna(median_value)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "693ccfc5-3c93-4f6d-9c2e-92692f12bd7d", + "metadata": {}, + "outputs": [], + "source": [ + "mode_value = df['state'].mode()[0]\n", + "\n", + "df['state'] = df['state'].fillna(mode_value)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0122860d-b438-4417-bdfb-c87681a3a4ff", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop(columns=['vehicle_type'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0add733f-d2fc-4c6f-a4c2-b3265a7ff72c", + "metadata": {}, + "outputs": [], + "source": [ + "df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(df['number_of_open_complaints'].median())\n", + "df['response'] = df['response'].fillna(df['response'].mode()[0])\n", + "df['vehicle_class'] = df['vehicle_class'].fillna(df['vehicle_class'].mode()[0])\n", + "df['vehicle_size'] = df['vehicle_size'].fillna(df['vehicle_size'].mode()[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ae18e49b-a95f-42dc-8b18-ed790e2dc766", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50", - "metadata": { - "id": "9c98ddc5-b041-4c94-ada1-4dfee5c98e50" - }, - "source": [ - "1. Create a new DataFrame that only includes customers who:\n", - " - have a **low total_claim_amount** (e.g., below $1,000),\n", - " - have a response \"Yes\" to the last marketing campaign." + "data": { + "text/plain": [ + "customer 0\n", + "state 0\n", + "customer_lifetime_value 0\n", + "response 0\n", + "coverage 0\n", + "education 0\n", + "effective_to_date 0\n", + "employmentstatus 0\n", + "gender 0\n", + "income 0\n", + "location_code 0\n", + "marital_status 0\n", + "monthly_premium_auto 0\n", + "months_since_last_claim 0\n", + "months_since_policy_inception 0\n", + "number_of_open_complaints 0\n", + "number_of_policies 0\n", + "policy_type 0\n", + "policy 0\n", + "renew_offer_type 0\n", + "sales_channel 0\n", + "total_claim_amount 0\n", + "vehicle_class 0\n", + "vehicle_size 0\n", + "dtype: int64" ] - }, + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1b46795a-31a6-451d-95dc-97850251a278", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "b9be383e-5165-436e-80c8-57d4c757c8c3", - "metadata": { - "id": "b9be383e-5165-436e-80c8-57d4c757c8c3" - }, - "source": [ - "2. Using the original Dataframe, analyze:\n", - " - the average `monthly_premium` and/or customer lifetime value by `policy_type` and `gender` for customers who responded \"Yes\", and\n", - " - compare these insights to `total_claim_amount` patterns, and discuss which segments appear most profitable or low-risk for the company." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "New DataFrame of low-risk, engaged customers:\n" + ] }, { - "cell_type": "markdown", - "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0", - "metadata": { - "id": "7050f4ac-53c5-4193-a3c0-8699b87196f0" - }, - "source": [ - "3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers." + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgenderincome...months_since_policy_inceptionnumber_of_open_complaintsnumber_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_size
3XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM49078...30.02Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsize
8FM55990California5989.773931YesPremiumCollege2011-01-19EmployedM66839...240.01Personal AutoPersonal L1Offer2Branch739.200000Sports CarMedsize
15CW49887California4626.801093YesBasicMaster2011-01-16EmployedF79487...870.01Special AutoSpecial L1Offer2Branch547.200000SUVMedsize
19NJ54277California3746.751625YesExtendedCollege2011-02-26EmployedF41479...381.01Personal AutoPersonal L2Offer2Call Center19.575683Two-Door CarLarge
27MQ68407Oregon4376.363592YesPremiumBachelor2011-02-28EmployedF63774...630.01Personal AutoPersonal L3Offer2Agent60.036683Four-Door CarMedsize
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " customer state customer_lifetime_value response coverage education \\\n", + "3 XL78013 Oregon 22332.439460 Yes Extended College \n", + "8 FM55990 California 5989.773931 Yes Premium College \n", + "15 CW49887 California 4626.801093 Yes Basic Master \n", + "19 NJ54277 California 3746.751625 Yes Extended College \n", + "27 MQ68407 Oregon 4376.363592 Yes Premium Bachelor \n", + "\n", + " effective_to_date employmentstatus gender income ... \\\n", + "3 2011-01-11 Employed M 49078 ... \n", + "8 2011-01-19 Employed M 66839 ... \n", + "15 2011-01-16 Employed F 79487 ... \n", + "19 2011-02-26 Employed F 41479 ... \n", + "27 2011-02-28 Employed F 63774 ... \n", + "\n", + " months_since_policy_inception number_of_open_complaints \\\n", + "3 3 0.0 \n", + "8 24 0.0 \n", + "15 87 0.0 \n", + "19 38 1.0 \n", + "27 63 0.0 \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "8 1 Personal Auto Personal L1 Offer2 \n", + "15 1 Special Auto Special L1 Offer2 \n", + "19 1 Personal Auto Personal L2 Offer2 \n", + "27 1 Personal Auto Personal L3 Offer2 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "8 Branch 739.200000 Sports Car Medsize \n", + "15 Branch 547.200000 SUV Medsize \n", + "19 Call Center 19.575683 Two-Door Car Large \n", + "27 Agent 60.036683 Four-Door Car Medsize \n", + "\n", + "[5 rows x 24 columns]" ] - }, + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#1. Create a new DataFrame that only includes customers who:\n", + "# have a low total_claim_amount (e.g., below $1,000),\n", + "# have a response \"Yes\" to the last marketing campaign.\n", + "\n", + "# Create the two conditions\n", + "low_claim_condition = df['total_claim_amount'] < 1000\n", + "yes_response_condition = df['response'] == 'Yes'\n", + "\n", + "# Combine the conditions with '&' and apply them to the DataFrame\n", + "# Note the parentheses around each condition, which is important!\n", + "low_risk_customers_df = df[low_claim_condition & yes_response_condition]\n", + "\n", + "print(\"New DataFrame of low-risk, engaged customers:\")\n", + "low_risk_customers_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "71aae6dd-657f-4c93-b729-d2153e7d70ad", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d", - "metadata": { - "id": "b60a4443-a1a7-4bbf-b78e-9ccdf9895e0d" - }, - "source": [ - "4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Profitability Analysis:\n" + ] }, { - "cell_type": "markdown", - "id": "b42999f9-311f-481e-ae63-40a5577072c5", - "metadata": { - "id": "b42999f9-311f-481e-ae63-40a5577072c5" - }, - "source": [ - "## Bonus" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
average_monthly_premiumaverage_clvaverage_claim_amount
policy_typegender
Corporate AutoF94.307712.63433.74
M92.197944.47408.58
Personal AutoF99.008339.79452.97
M91.097448.38457.01
Special AutoF92.317691.58453.28
M86.348247.09429.53
\n", + "
" + ], + "text/plain": [ + " average_monthly_premium average_clv \\\n", + "policy_type gender \n", + "Corporate Auto F 94.30 7712.63 \n", + " M 92.19 7944.47 \n", + "Personal Auto F 99.00 8339.79 \n", + " M 91.09 7448.38 \n", + "Special Auto F 92.31 7691.58 \n", + " M 86.34 8247.09 \n", + "\n", + " average_claim_amount \n", + "policy_type gender \n", + "Corporate Auto F 433.74 \n", + " M 408.58 \n", + "Personal Auto F 452.97 \n", + " M 457.01 \n", + "Special Auto F 453.28 \n", + " M 429.53 " ] - }, + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#2. Using the original Dataframe, analyze:\n", + "# the average monthly_premium and/or customer lifetime value by policy_type and gender for customers who responded \"Yes\", and\n", + "# compare these insights to total_claim_amount patterns, and discuss which segments appear most profitable or low-risk for the company.\n", + "\n", + "# First, filter for customers who responded \"Yes\"\n", + "yes_customers = df[df['response'] == 'Yes']\n", + "\n", + "# Now, group by policy type and gender, and aggregate the key metrics\n", + "profitability_analysis = yes_customers.groupby(['policy_type', 'gender']).agg(\n", + " average_monthly_premium=('monthly_premium_auto', 'mean'),\n", + " average_clv=('customer_lifetime_value', 'mean'),\n", + " average_claim_amount=('total_claim_amount', 'mean')\n", + ")\n", + "\n", + "print(\"Profitability Analysis:\")\n", + "profitability_analysis.round(2) # Round for readability" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a54ef7b5-ae49-4b9a-a568-4c103b18ae76", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "81ff02c5-6584-4f21-a358-b918697c6432", - "metadata": { - "id": "81ff02c5-6584-4f21-a358-b918697c6432" - }, - "source": [ - "5. The marketing team wants to analyze the number of policies sold by state and month. Present the data in a table where the months are arranged as columns and the states are arranged as rows." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "States with over 500 customers:\n", + "state\n", + "California 4183\n", + "Oregon 2909\n", + "Arizona 1937\n", + "Nevada 993\n", + "Washington 888\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#3. Analyze the total number of customers who have policies in each state, and then filter the results to only include states where there are more than 500 customers.\n", + "\n", + "# Get the number of customers in each state\n", + "state_counts = df['state'].value_counts()\n", + "\n", + "# Filter the results to only include states with more than 500 customers\n", + "states_over_500 = state_counts[state_counts > 500]\n", + "\n", + "print(\"States with over 500 customers:\")\n", + "print(states_over_500)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f17ca56f-50bf-4ae2-bc61-b418e1b77f1e", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "b6aec097-c633-4017-a125-e77a97259cda", - "metadata": { - "id": "b6aec097-c633-4017-a125-e77a97259cda" - }, - "source": [ - "6. Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold.\n", - "\n", - "*Hint:*\n", - "- *To accomplish this, you will first need to group the data by state and month, then count the number of policies sold for each group. Afterwards, you will need to sort the data by the count of policies sold in descending order.*\n", - "- *Next, you will select the top 3 states with the highest number of policies sold.*\n", - "- *Finally, you will create a new DataFrame that contains the number of policies sold by month for each of the top 3 states.*" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "CLV Analysis by Education and Gender:\n" + ] }, { - "cell_type": "markdown", - "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009", - "metadata": { - "id": "ba975b8a-a2cf-4fbf-9f59-ebc381767009" - }, - "source": [ - "7. The marketing team wants to analyze the effect of different marketing channels on the customer response rate.\n", - "\n", - "Hint: You can use melt to unpivot the data and create a table that shows the customer response rate (those who responded \"Yes\") by marketing channel." + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
maxminmedian
educationgender
BachelorF73225.961904.005640.51
M67907.271898.015548.03
CollegeF61850.191898.685623.61
M61134.681918.126005.85
DoctorF44856.112395.575332.46
M32677.342267.605577.67
High School or BelowF55277.452144.926039.55
M83325.381940.986286.73
MasterF51016.072417.785729.86
M50568.262272.315579.10
\n", + "
" + ], + "text/plain": [ + " max min median\n", + "education gender \n", + "Bachelor F 73225.96 1904.00 5640.51\n", + " M 67907.27 1898.01 5548.03\n", + "College F 61850.19 1898.68 5623.61\n", + " M 61134.68 1918.12 6005.85\n", + "Doctor F 44856.11 2395.57 5332.46\n", + " M 32677.34 2267.60 5577.67\n", + "High School or Below F 55277.45 2144.92 6039.55\n", + " M 83325.38 1940.98 6286.73\n", + "Master F 51016.07 2417.78 5729.86\n", + " M 50568.26 2272.31 5579.10" ] - }, + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#4. Find the maximum, minimum, and median customer lifetime value by education level and gender. Write your conclusions.\n", + "\n", + "clv_by_segment = df.groupby(['education', 'gender'])['customer_lifetime_value'].agg(['max', 'min', 'median'])\n", + "\n", + "print(\"CLV Analysis by Education and Gender:\")\n", + "clv_by_segment.round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ff100757-8f5a-4285-bbce-a86d23a5e067", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert the column to datetime objects\n", + "df['effective_to_date'] = pd.to_datetime(df['effective_to_date'])\n", + "\n", + "# Extract the month number and create a new 'month' column\n", + "df['month'] = df['effective_to_date'].dt.month" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "1ee8cb70-082c-4cf9-b76a-64733ec71e9b", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d", - "metadata": { - "id": "e4378d94-48fb-4850-a802-b1bc8f427b2d" - }, - "source": [ - "External Resources for Data Filtering: https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b9" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Policies Sold by State and Month:\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "id": "449513f4-0459-46a0-a18d-9398d974c9ad", - "metadata": { - "id": "449513f4-0459-46a0-a18d-9398d974c9ad" - }, - "outputs": [], - "source": [ - "# your code goes here" + "data": { + "text/plain": [ + "month state \n", + "1 Arizona 3052\n", + " California 6666\n", + " Nevada 1493\n", + " Oregon 4697\n", + " Washington 1358\n", + "2 Arizona 2864\n", + " California 5901\n", + " Nevada 1278\n", + " Oregon 3969\n", + " Washington 1225\n", + "dtype: int64" ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" + ], + "source": [ + "#5. The marketing team wants to analyze the number of policies sold by state and month. \n", + "# Present the data in a table where the months are arranged as columns and the states are arranged as rows.\n", + "\n", + "# Create the pivot table\n", + "# We add fill_value=0 to make sure any state/month combo with no sales shows a 0 instead of NaN\n", + "policies_pivot = df.pivot_table(\n", + " index='state',\n", + " columns='month',\n", + " values='number_of_policies',\n", + " aggfunc='sum',\n", + " fill_value=0\n", + ")\n", + "\n", + "print(\"Policies Sold by State and Month:\")\n", + "policies_pivot.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "67445039-fb08-43a9-977b-25433aaaae2f", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Top 3 States by Number of Policies Sold:\n", + "month 1 2\n", + "state \n", + "California 6666 5901\n", + "Oregon 4697 3969\n", + "Arizona 3052 2864\n" + ] } + ], + "source": [ + "#6. Display a new DataFrame that contains the number of policies sold by month, by state, for the top 3 states with the highest number of policies sold.\n", + "\n", + "# Create a 'total' column by summing the policies across the months (the columns)\n", + "policies_pivot['total_policies'] = policies_pivot.sum(axis=1)\n", + "\n", + "# Sort the table by this new 'total' column and get the index (the state names) of the top 3\n", + "top_3_states = policies_pivot.sort_values(by='total_policies', ascending=False).head(3).index\n", + "\n", + "# Use .loc to select only the rows for the top 3 states from our pivot table\n", + "top_3_df = policies_pivot.loc[top_3_states]\n", + "\n", + "# Drop the temporary 'total_policies' column for a clean final table\n", + "top_3_df = top_3_df.drop(columns=['total_policies'])\n", + "\n", + "\n", + "print(\"\\nTop 3 States by Number of Policies Sold:\")\n", + "print(top_3_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98aaa755-4214-4d70-9335-3b953ece995f", + "metadata": {}, + "outputs": [], + "source": [ + "😮‍💨" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 }