From 4354a27b5956309e7f034180361b6a0f0bf6d1e3 Mon Sep 17 00:00:00 2001 From: Marisa Oliveira <163911161+HR-Freak@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:58:07 +0000 Subject: [PATCH] Solved data structuring and combining --- .../cleaning_functions.cpython-313.pyc | Bin 0 -> 4316 bytes cleaning_functions.py | 94 ++ lab-dw-data-structuring-and-combining.ipynb | 975 +++++++++++++++++- 3 files changed, 1059 insertions(+), 10 deletions(-) create mode 100644 __pycache__/cleaning_functions.cpython-313.pyc create mode 100644 cleaning_functions.py diff --git a/__pycache__/cleaning_functions.cpython-313.pyc b/__pycache__/cleaning_functions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5583d724b5bc76f596cc71c7467d2f654b3c593f GIT binary patch literal 4316 zcmcInO;8)j6`mhyMk65!fq(^!051efAjHDjiPw%3ehgl(&6b%ZxR!D$gEW{4qZw6? zgt6CEE2>hpupEFYyR6CyR}Qf&TZxZ*$RURua@0fzsd`=It=e3C!%O9o$|0{uKL#N? zd)PGSbx-$uuYcb6UcWwcyPXsi^q+w8F9$_^M?NZrDK`!;LE|AMQX>5kg=nj05Gyhr z6tamdP`hXY%87QMyvPA{h&)h1bO3eUrh~#uqHH!smvxfL20s>lhkpX{kea9Fpsp=* zl~o)Wn6n@GC^yfR>$?2>Nq&(QnX_fx^ORa;fK>?9k7`3?McW81+RxJ(th0{_a4_nq?FV%u1V_MloB%{3$ncWengd)!tp!dIE;ls ze>#>*BSpI%mb7>{rYDvZ4Zb2VRhCF2x}4TxDMi2(dEv3cnF%PW47j?!?N|SVZ+4MB2DI;JBkM)1ZA;m@w4U8x;Fh0 zFi^!zJ&QA7bFlg_TUiBJ^8l<5YP+@edQzOoiw;pZCS@;6Io~Md5*tS7Bl9@R67Dxj zctrB&JvNWKBn9YfoWlai_|>qt{1 z%#S1Go~}vQxgZ%U0H2hu5n;lDnek6CvoMYYIi8M51XEa;kYX#csv`qCDoyNOlUEb~ zS~R9ghJm?j>Gd?a8)WY}#iWi>M$e>#?m0`n$J*ljuYz{WNk-~!Qby!>iEAMxiS6km z^dJ{>Avchc+aNDt@s}{IXlt;@cob+FP)*LFfpGo}NQU~>-@3cLcdg(bF#Q8xv3dVc z!9Qa9M+*LPrvF^t|FeSsqUpa_@K2cjiM)U6Ma!wZh5gtw@vmcM*G#_U$^lE+`ONyp zwF3v$+_Jl{H&|%uH=Fv4ji-v9wxXxw6>l5h4k=qRpP9(q+4$(dL$$SMJNNWLYuIcJ zKl$wItGVHseCrh${%qsp1CHwK%0|tOaG~R@*>Sem8YumPo}UsuokY*)rmgLufokw% zK14p)NPVCBH;nwhd`cvKV1vlV{!lLzxvE4x?HI{#Jft7p`Ld+Dl)IR z0qw#pZi}Ujl%9}L6re1pl!RQu0jB#gH4Ty3j^x|&I;IC-#$oc8B*Z|F?t{w0nT0HD zUsiPq!a35@TKojggmUau*U%`)^~2A&4+OyFaOPZnrlY^$7&0A0Ux|6gdoP5Bt>&GD 
zt%Y4NdwT!!=0eULs^90`YJFhe?#ed5;#r}~!dZ^*F>SqI`2K<1z%R|d@j~B>**Eid zI^TCSGf^6~0_{mqkceRgS{v-~H7GQC5e7=7t|4ue(bx4H2@f@8z8a>^lt-i>7>}-d zX)+aa%@N0~DT~BAty1ASYE;lzPsUR#qtU=w?X8T$#X`KLrj%jHXv{n_*HkkIpCU&N zPvky$Kv9w(cE!vAtu2uLP@{~UykZYS%dpO0lA2xr3^$o0n?FBz(+WFJqby~OX9Vdi|I0ok)(!$zzYFB z*npX29J5kV!FCd$4cw?0QP3*MG z$>+TA#oET&7xy>r7x+Gt@7os(e9+{B#ZdUqp{Jo|H~ttdgf5w(OMfx)p~+1q=lY51 z7$|yun?lj)-U)4m9*EnK%w*BsNPKzc*4C};*nVpBR?aI%5M#@=Mf z66&Sl<&ip(qmpb@y!|FiuITF_igHms;A^0J@CWIxb1_tOi#D>Zni<078A#f0hs3Dl z3jl0h(PC;k4ye_aej~?HmM=g8;x=RzK1+}`tV|w}ClKZma$J!#Y)j~IxdK`T=pzX= zWI&JkDvz)w2$u_KQe+i9;)MC}i=Bp6XsKx4L zkM$B9TIE=8)h&+oRsnsiw|e~@?X}*X^lXr`UTs#AYb7Iy#6SyG*7i>vld}{#$(s>< z3akt8Gl(@a)N`RTJCPT41U52A=j<`u+ 1: + mid = splits.iloc[:, 1] + else: + mid = splits.iloc[:, 0] + df["number_of_open_complaints"] = ( + pd.to_numeric(mid, errors="coerce") + .fillna(0) + .astype(int) + ) + return df + + + +def handle_nulls(df): + num_cols = df.select_dtypes(include=["number"]).columns + cat_cols = df.select_dtypes(include=["object"]).columns + + for col in num_cols: + df[col] = df[col].fillna(df[col].median()) + + for col in cat_cols: + df[col] = df[col].fillna(df[col].mode()[0]) + + return df + + +def remove_duplicates(df): + df = df.drop_duplicates(keep="first") + df = df.reset_index(drop=True) + return df + + +def clean_data(df): + df = clean_column_names(df) + df = clean_invalid_values(df) + df = clean_clv(df) + df = clean_open_complaints(df) + df = handle_nulls(df) + df = remove_duplicates(df) + return df diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..874bd78 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,14 +36,192 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, "outputs": [], "source": [ - "# Your code goes 
here" + "import pandas as pd\n", + "from cleaning_functions import clean_data" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b0e59683", + "metadata": {}, + "outputs": [], + "source": [ + "file1 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "file2 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "file3 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "# Load datasets\n", + "df1 = pd.read_csv(file1)\n", + "df2 = pd.read_csv(file2)\n", + "df3 = pd.read_csv(file3)\n", + "# Clean datasets\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a0bc80a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "file1 columns: ['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'Vehicle Class', 'Total Claim Amount']\n", + "file2 columns: ['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Total Claim Amount', 'Policy Type', 'Vehicle Class']\n", + "file3 columns: ['Customer', 'State', 'Customer Lifetime Value', 'Education', 'Gender', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Policy Type', 'Total Claim Amount', 'Vehicle Class']\n" + ] + } + ], + "source": [ + "print(\"file1 columns:\", df1.columns.tolist())\n", + "print(\"file2 columns:\", df2.columns.tolist())\n", + "print(\"file3 columns:\", df3.columns.tolist()) \n", + "# Check column names for consistency" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4e267477", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "file1: ['1/0/00' '1/2/00' '1/1/00' '1/3/00' '1/5/00' '1/4/00' nan]\n", + "file2: ['1/0/00' '1/1/00' '1/3/00' '1/5/00' '1/2/00' '1/4/00']\n", + "file3: [0 2 3 1 5 
4]\n" + ] + } + ], + "source": [ + "print(\"file1:\", df1[\"Number of Open Complaints\"].unique()[:10])\n", + "print(\"file2:\", df2[\"Number of Open Complaints\"].unique()[:10])\n", + "print(\"file3:\", df3[\"Number of Open Complaints\"].unique()[:10])\n", + "# Check unique values in 'Number of Open Complaints' column" + ] + }, + { + "cell_type": "markdown", + "id": "4f23695b", + "metadata": {}, + "source": [ + "Went straight to this column because I remembered changing it in the last lab, and I got an error message when I ran \"clean_data\" on all the files" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "90f48bb2", + "metadata": {}, + "outputs": [], + "source": [ + "def clean_open_complaints(df):\n", + " # Column name *after* clean_column_names()\n", + " col = df[\"number_of_open_complaints\"].astype(str)\n", + " # Split on '/' because we have values like '1/0/00'\n", + " splits = col.str.split(\"/\", expand=True)\n", + " # If we have format X/Y/Z, use middle (Y). 
If it's already a single number, use that.\n", + " if splits.shape[1] >= 2:\n", + " values = splits[1]\n", + " else:\n", + " values = splits[0]\n", + " df[\"number_of_open_complaints\"] = (\n", + " pd.to_numeric(values, errors=\"coerce\")\n", + " .fillna(0)\n", + " .astype(int)\n", + " )\n", + " return df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "e310f5d1", + "metadata": {}, + "outputs": [], + "source": [ + "import importlib\n", + "import cleaning_functions\n", + "importlib.reload(cleaning_functions)\n", + "from cleaning_functions import clean_data" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "1e268c8f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "41be0e2c", + "metadata": {}, + "outputs": [], + "source": [ + "file1 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "file2 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "file3 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "\n", + "df1 = pd.read_csv(file1)\n", + "df2 = pd.read_csv(file2)\n", + "df3 = pd.read_csv(file3)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "e06995da", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 9138 entries, 0 to 9137\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 customer 9138 non-null object \n", + " 1 state 9138 non-null object \n", + " 2 gender 9138 non-null object \n", + " 3 education 9138 non-null object \n", + " 4 customer_lifetime_value 9138 non-null float64\n", + " 5 income 9138 non-null float64\n", + " 6 monthly_premium_auto 9138 non-null float64\n", + " 7 number_of_open_complaints 9138 non-null int64 \n", + " 8 policy_type 9138 non-null 
object \n", + " 9 vehicle_class 9138 non-null object \n", + " 10 total_claim_amount 9138 non-null float64\n", + "dtypes: float64(4), int64(1), object(6)\n", + "memory usage: 785.4+ KB\n" + ] + } + ], + "source": [ + "df1_cleaned = clean_data(df1)\n", + "df2_cleaned = clean_data(df2)\n", + "df3_cleaned = clean_data(df3)\n", + "\n", + "df_combined = pd.concat([df1_cleaned, df2_cleaned, df3_cleaned], ignore_index=True)\n", + "\n", + "df_combined.info()" ] }, { @@ -72,14 +250,230 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "import pandas as pd\n", + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n", + "df.head()" ] }, { @@ -93,6 +487,127 @@ "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." 
] }, + { + "cell_type": "code", + "execution_count": 38, + "id": "285a4645", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n", + " 'response', 'coverage', 'education', 'effective_to_date',\n", + " 'employmentstatus', 'gender', 'income', 'location_code',\n", + " 'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n", + " 'months_since_policy_inception', 'number_of_open_complaints',\n", + " 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n", + " 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n", + " 'vehicle_type', 'month'],\n", + " dtype='object')" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "63a444f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
sales_channel
Agent8021.81
Branch8060.62
Call Center8110.36
Web7809.12
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value\n", + "sales_channel \n", + "Agent 8021.81\n", + "Branch 8060.62\n", + "Call Center 8110.36\n", + "Web 7809.12" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_sales = (\n", + " df.pivot_table(\n", + " index=\"sales_channel\",\n", + " values=\"customer_lifetime_value\",\n", + " aggfunc=\"mean\" \n", + ").round(2)\n", + ")\n", + "pivot_sales" + ] + }, + { + "cell_type": "markdown", + "id": "bb1b4788", + "metadata": {}, + "source": [ + "Actually, to do this analysis we would need a dedicated revenue column, but since we don't have one, I used the average CLV per channel, which is the closest data we have for this analysis.\n", + "These are the insights:\n", + "- Call Center has the highest average CLV, meaning that customers acquired through this channel contribute the most long-term revenue\n", + "- Branch and Agent are in the middle, indicating moderate long-term revenue\n", + "- Web has the lowest average CLV; it brings in customers with lower long-term value" + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", @@ -103,6 +618,125 @@ "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." ] }, + { + "cell_type": "code", + "execution_count": 45, + "id": "8e34dde5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationBachelorCollegeDoctorHigh School or BelowMasterTotal
gender
F13386258.1112460107.911465701.7813793600.123874600.2544980268.17
M12110061.8312851725.021490482.0612680914.193365559.0642498742.16
Total25496319.9525311832.932956183.8426474514.317240159.3087479010.33
\n", + "
" + ], + "text/plain": [ + "education Bachelor College Doctor High School or Below \\\n", + "gender \n", + "F 13386258.11 12460107.91 1465701.78 13793600.12 \n", + "M 12110061.83 12851725.02 1490482.06 12680914.19 \n", + "Total 25496319.95 25311832.93 2956183.84 26474514.31 \n", + "\n", + "education Master Total \n", + "gender \n", + "F 3874600.25 44980268.17 \n", + "M 3365559.06 42498742.16 \n", + "Total 7240159.30 87479010.33 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_clv = (\n", + " df.pivot_table(\n", + " index=\"gender\",\n", + " columns=\"education\",\n", + " values=\"customer_lifetime_value\",\n", + " aggfunc=\"sum\",\n", + " margins=True,\n", + " margins_name=\"Total\",\n", + " ).round(2)\n", + ")\n", + "pivot_clv" + ] + }, + { + "cell_type": "markdown", + "id": "f3745556", + "metadata": {}, + "source": [ + "- By total CLV, female customers account for more than male customers\n", + "- High School or Below is the education level with the highest total CLV\n", + "- Combining gender and education, female customers with High School or Below have the highest total CLV" + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", @@ -130,14 +764,335 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...policy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonthefective_to_data
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeAFebruary2011-02-18
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeAJanuary2011-01-18
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeAFebruary2011-02-10
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeAJanuary2011-01-11
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeAJanuary2011-01-17
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " policy_type policy renew_offer_type sales_channel \\\n", + "0 Corporate Auto Corporate L3 Offer3 Agent \n", + "1 Personal Auto Personal L3 Offer4 Call Center \n", + "2 Personal Auto Personal L3 Offer3 Call Center \n", + "3 Corporate Auto Corporate L3 Offer2 Branch \n", + "4 Personal Auto Personal L2 Offer1 Branch \n", + "\n", + " total_claim_amount vehicle_class vehicle_size vehicle_type month \\\n", + "0 292.800000 Four-Door Car Medsize A February \n", + "1 744.924331 Four-Door Car Medsize A January \n", + "2 480.000000 SUV Medsize A February \n", + "3 484.013411 Four-Door Car Medsize A January \n", + "4 707.925645 Four-Door Car Medsize A January \n", + "\n", + " efective_to_data \n", + "0 2011-02-18 \n", + "1 2011-01-18 \n", + "2 2011-02-10 \n", + "3 2011-01-11 \n", + "4 2011-01-17 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"efective_to_data\"] = pd.to_datetime(df[\"effective_to_date\"])\n", + "df[\"month\"] = df[\"efective_to_data\"].dt.month_name()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "453ac779", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typemonthnumber_of_open_complaints
0Corporate AutoFebruary385.208135
1Corporate AutoJanuary443.434952
2Personal AutoFebruary1453.684441
3Personal AutoJanuary1727.605722
4Special AutoFebruary95.226817
5Special AutoJanuary87.074049
\n", + "
" + ], + "text/plain": [ + " policy_type month number_of_open_complaints\n", + "0 Corporate Auto February 385.208135\n", + "1 Corporate Auto January 443.434952\n", + "2 Personal Auto February 1453.684441\n", + "3 Personal Auto January 1727.605722\n", + "4 Special Auto February 95.226817\n", + "5 Special Auto January 87.074049" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complaints_summary = (\n", + " df.groupby([\"policy_type\", \"month\"])[\"number_of_open_complaints\"]\n", + " .sum()\n", + " .reset_index()\n", + ")\n", + "complaints_summary" + ] + }, + { + "cell_type": "markdown", + "id": "6ac5ee88", + "metadata": {}, "source": [ - "# Your code goes here" + "January has the highest number of open complaints for the Personal Auto and Corporate Auto policy types; for Special Auto, it is February" ] } ], @@ -146,7 +1101,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -160,7 +1115,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,