import pandas as pd

# Lookup tables used by the cleaning steps below.
_GENDER_MAP = {"F": "F", "FEMALE": "F", "FEMAL": "F", "M": "M", "MALE": "M"}
_STATE_MAP = {"AZ": "Arizona", "Cali": "California", "WA": "Washington"}
_EDUCATION_MAP = {"Bachelors": "Bachelor"}
_VEHICLE_CLASS_MAP = {
    "Sports Car": "Luxury",
    "Luxury SUV": "Luxury",
    "Luxury Car": "Luxury",
}


def _normalize_columns(frame):
    """Return a copy of *frame* with snake_case headers and ``st`` renamed to ``state``."""
    renamed = frame.copy()
    renamed.columns = [
        str(col).strip().lower().replace(" ", "_") for col in renamed.columns
    ]
    return renamed.rename(columns={"st": "state"})


def _normalize_gender(value):
    """Map one raw gender value to ``'F'``/``'M'``; anything else becomes NaN."""
    if pd.isna(value):
        return float("nan")
    return _GENDER_MAP.get(str(value).strip().upper(), float("nan"))


def _parse_open_complaints(value):
    """Return the complaint count from a ``'1/2/00'``-style string or a plain number."""
    if pd.isna(value):
        return pd.NA
    text = str(value).strip()
    if "/" in text:
        # The count is the middle field of the slash-separated form.
        text = text.split("/")[1]
    try:
        return int(float(text))
    except (ValueError, OverflowError):
        return pd.NA


def clean_new_customer_data(frames):
    """Normalise, combine and clean raw customer frames.

    Args:
        frames: Non-empty list of raw ``DataFrame`` objects. Headers are
            normalised *per frame before concatenation* so that files which
            spell the same header differently (for example ``ST`` vs ``State``
            or ``GENDER`` vs ``Gender`` -- TODO confirm against the actual
            CSVs) end up in a single column instead of producing duplicate
            column labels.

    Returns:
        A cleaned ``DataFrame`` with a fresh ``RangeIndex`` and exact
        duplicate rows removed. Cleaning steps are only applied to columns
        that are present.

    Raises:
        ValueError: If *frames* is empty (raised by ``pd.concat``).
    """
    df = pd.concat([_normalize_columns(frame) for frame in frames], ignore_index=True)

    # Drop rows that carry no data at all; otherwise the fill step below would
    # turn them into a fabricated record made of modes and medians.
    df = df.dropna(how="all").reset_index(drop=True)

    if "gender" in df.columns:
        df["gender"] = df["gender"].map(_normalize_gender)

    if "state" in df.columns:
        df["state"] = df["state"].replace(_STATE_MAP)

    if "education" in df.columns:
        df["education"] = df["education"].replace(_EDUCATION_MAP)

    if "customer_lifetime_value" in df.columns:
        # Values may be strings with a trailing '%'; unparseable values -> NaN.
        df["customer_lifetime_value"] = pd.to_numeric(
            df["customer_lifetime_value"]
            .astype(str)
            .str.replace("%", "", regex=False),
            errors="coerce",
        )

    if "vehicle_class" in df.columns:
        df["vehicle_class"] = df["vehicle_class"].replace(_VEHICLE_CLASS_MAP)

    if "number_of_open_complaints" in df.columns:
        df["number_of_open_complaints"] = pd.Series(
            pd.array(
                [_parse_open_complaints(v) for v in df["number_of_open_complaints"]],
                dtype="Int64",
            ),
            index=df.index,
        )

    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    num_cols = [c for c in df.columns if is_numeric(df[c]) and not is_bool(df[c])]
    cat_cols = [
        c
        for c in df.columns
        if c not in num_cols
        and (
            pd.api.types.is_object_dtype(df[c])
            or pd.api.types.is_string_dtype(df[c])
            or isinstance(df[c].dtype, pd.CategoricalDtype)
        )
    ]

    # Numeric gaps -> median (skipped when the whole column is missing).
    for col in num_cols:
        median = df[col].median()
        if pd.isna(median):
            continue
        if pd.api.types.is_integer_dtype(df[col]):
            # A fractional median cannot be stored in an integer column.
            median = int(round(median))
        df[col] = df[col].fillna(median)

    # Categorical gaps -> most frequent value (skipped when there is none).
    for col in cat_cols:
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Cast numerics to plain int. NOTE(review): this truncates fractional
    # values such as customer_lifetime_value; kept because the original
    # script did the same -- confirm it matches the previously cleaned file.
    for col in num_cols:
        if df[col].notna().all():
            df[col] = df[col].astype(int)

    return df.drop_duplicates().reset_index(drop=True)


# `__name__` is "__main__" inside a notebook kernel, so this still runs there
# and still defines df1 / df_new / df_combined as globals; the guard only
# keeps the file and network I/O from firing when the helpers are imported.
if __name__ == "__main__":
    # Previously cleaned file
    df1 = pd.read_csv("cleaned_dataset.csv")

    # The two other raw files
    url2 = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv"
    url3 = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv"

    df2 = pd.read_csv(url2)
    df3 = pd.read_csv(url3)

    # Normalise headers per file, combine, then clean
    df_new = clean_new_customer_data([df2, df3])

    # Combine with cleaned df1
    df_combined = pd.concat([df1, df_new], ignore_index=True)
    df_combined = df_combined.drop_duplicates().reset_index(drop=True)

    # Save cleaned and combined dataset
    df_combined.to_csv("final_cleaned_data.csv", index=False)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
..................................................................
1090510905FE99816Nevada15563.369440NoPremiumBachelor2011-01-19UnemployedF...7Personal AutoPersonal L1Offer3Web1214.400000Luxury CarMedsizeA1
1090610906KX53892Oregon5259.444853NoBasicCollege2011-01-06EmployedF...6Personal AutoPersonal L3Offer2Branch273.018929Four-Door CarMedsizeA1
1090710907TL39050Arizona23893.304100NoExtendedBachelor2011-02-06EmployedF...2Corporate AutoCorporate L3Offer1Web381.306996Luxury SUVMedsizeA2
1090810908WA60547California11971.977650NoPremiumCollege2011-02-13EmployedF...6Personal AutoPersonal L1Offer1Branch618.288849SUVMedsizeA2
1090910909IV32877California6857.519928NoBasicBachelor2011-01-08UnemployedM...3Personal AutoPersonal L1Offer4Web1021.719397SUVMedsizeA1
\n", + "

10910 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "... ... ... ... ... ... \n", + "10905 10905 FE99816 Nevada 15563.369440 No \n", + "10906 10906 KX53892 Oregon 5259.444853 No \n", + "10907 10907 TL39050 Arizona 23893.304100 No \n", + "10908 10908 WA60547 California 11971.977650 No \n", + "10909 10909 IV32877 California 6857.519928 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "... ... ... ... ... ... ... \n", + "10905 Premium Bachelor 2011-01-19 Unemployed F ... \n", + "10906 Basic College 2011-01-06 Employed F ... \n", + "10907 Extended Bachelor 2011-02-06 Employed F ... \n", + "10908 Premium College 2011-02-13 Employed F ... \n", + "10909 Basic Bachelor 2011-01-08 Unemployed M ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "... ... ... ... ... 
def revenue_by_sales_channel(
    data,
    channel_col="sales_channel",
    value_col="total_claim_amount",
    decimals=2,
):
    """Return the summed *value_col* per *channel_col*, largest first.

    Args:
        data: DataFrame containing *channel_col* and *value_col*.
        channel_col: Column whose distinct values become the index.
        value_col: Numeric column that is summed per channel.
        decimals: Number of decimal places the totals are rounded to.

    Returns:
        A one-column DataFrame indexed by channel, sorted by total descending.
    """
    totals = data.pivot_table(
        index=channel_col,
        values=value_col,
        aggfunc="sum",
    ).round(decimals)
    return totals.sort_values(value_col, ascending=False)


def mean_clv_by_gender_and_education(
    data,
    value_col="customer_lifetime_value",
    row_col="gender",
    column_col="education",
    decimals=2,
):
    """Return the mean *value_col* for every (*row_col*, *column_col*) pair.

    Args:
        data: DataFrame containing the three named columns.
        value_col: Numeric column that is averaged.
        row_col: Column whose distinct values become the index.
        column_col: Column whose distinct values become the columns.
        decimals: Number of decimal places the means are rounded to.

    Returns:
        A DataFrame of rounded means; combinations with no rows are NaN.
    """
    return pd.pivot_table(
        data,
        values=value_col,
        index=row_col,
        columns=column_col,
        aggfunc="mean",
    ).round(decimals)


# `__name__` is "__main__" inside a notebook kernel, so this still runs there;
# the guard only keeps the network read from firing on import.
if __name__ == "__main__":
    url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv"
    df = pd.read_csv(url)
    # A bare `df` expression shows nothing inside an `if` block, so print it.
    print(df)

    # 1: total claim amount per sales channel
    revenue_by_channel = revenue_by_sales_channel(df)
    print(revenue_by_channel)

    # 2: average customer lifetime value per gender and education level
    clv_pivot = mean_clv_by_gender_and_education(df)
    print(clv_pivot)