# data_cleaning.py
"""Cleaning helpers for customer DataFrames.

The ``standardize_*``, ``clean_and_convert_numerical`` and
``handle_missing_values`` helpers update the frame they are given and return
it so calls can be chained.  ``clean_column_names`` and ``remove_duplicates``
return a new frame.  Missing values stay missing (never the text ``'nan'``)
until ``handle_missing_values`` fills them.
"""

import pandas as pd
import numpy as np

# Upper-cased abbreviations (plus the informal 'CALI') -> upper-cased full names.
_STATE_NAMES = {
    'AL': 'ALABAMA', 'AK': 'ALASKA', 'AZ': 'ARIZONA', 'AR': 'ARKANSAS',
    'CA': 'CALIFORNIA', 'CALI': 'CALIFORNIA',
    'CO': 'COLORADO', 'CT': 'CONNECTICUT', 'DE': 'DELAWARE', 'FL': 'FLORIDA',
    'GA': 'GEORGIA', 'HI': 'HAWAII', 'ID': 'IDAHO', 'IL': 'ILLINOIS',
    'IN': 'INDIANA', 'IA': 'IOWA', 'KS': 'KANSAS', 'KY': 'KENTUCKY',
    'LA': 'LOUISIANA', 'ME': 'MAINE', 'MD': 'MARYLAND',
    'MA': 'MASSACHUSETTS', 'MI': 'MICHIGAN', 'MN': 'MINNESOTA',
    'MS': 'MISSISSIPPI', 'MO': 'MISSOURI', 'MT': 'MONTANA', 'NE': 'NEBRASKA',
    'NV': 'NEVADA', 'NH': 'NEW HAMPSHIRE', 'NJ': 'NEW JERSEY',
    'NM': 'NEW MEXICO', 'NY': 'NEW YORK', 'NC': 'NORTH CAROLINA',
    'ND': 'NORTH DAKOTA', 'OH': 'OHIO', 'OK': 'OKLAHOMA', 'OR': 'OREGON',
    'PA': 'PENNSYLVANIA', 'RI': 'RHODE ISLAND', 'SC': 'SOUTH CAROLINA',
    'SD': 'SOUTH DAKOTA', 'TN': 'TENNESSEE', 'TX': 'TEXAS', 'UT': 'UTAH',
    'VT': 'VERMONT', 'VA': 'VIRGINIA', 'WA': 'WASHINGTON',
    'WV': 'WEST VIRGINIA', 'WI': 'WISCONSIN', 'WY': 'WYOMING',
}

# Columns filled with 'Unknown' / with the column median by handle_missing_values.
_CATEGORICAL_COLUMNS = (
    'customer', 'state', 'gender', 'education', 'policy_type', 'vehicle_class',
)
_NUMERICAL_COLUMNS = (
    'total_claim_amount', 'monthly_premium_auto', 'customer_income',
    'customer_lifetime_value',
)


def _stripped_text(series: pd.Series) -> pd.Series:
    """Return *series* as whitespace-stripped strings.

    Missing entries are stringified as well; callers restore them with
    ``.where(series.notna())`` once the text has been transformed.
    """
    return series.astype(str).str.strip()


def _drop_empty_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without rows whose every cell is missing (index reset)."""
    kept = df.dropna(how='all').reset_index(drop=True)
    print(f"Number of empty rows removed: {len(df) - len(kept)}")
    return kept


def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with standardized column names.

    Names are lower-cased, spaces become underscores, and the short names
    ``st`` / ``income`` are renamed to ``state`` / ``customer_income``.
    The input frame is left untouched.
    """
    aliases = {'st': 'state', 'income': 'customer_income'}
    names = []
    for label in df.columns:
        name = str(label).lower().replace(' ', '_')
        names.append(aliases.get(name, name))

    renamed = df.copy()
    renamed.columns = names
    return renamed


def standardize_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the 'gender' column to 'F' / 'M' where recognisable.

    Any value whose first non-blank character is F/f or M/m becomes 'F' or
    'M'; other values are kept (whitespace-stripped) and missing values stay
    missing.  Updates *df* and returns it.
    """
    if 'gender' not in df.columns:
        return df

    gender = df['gender']
    text = _stripped_text(gender)
    initial = text.str[:1].str.upper()
    recognised = initial.isin(['F', 'M'])
    # Restore missing entries last: astype(str) would otherwise leave 'nan'
    # text that a later fillna() can no longer see.
    df['gender'] = text.where(~recognised, initial).where(gender.notna())
    return df


def standardize_state(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize the 'state' column to upper-cased full state names.

    Values are stripped and upper-cased; known abbreviations are expanded,
    anything else is kept upper-cased.  Missing values stay missing.
    Updates *df* and returns it.
    """
    if 'state' in df.columns:
        state = df['state']
        upper = _stripped_text(state).str.upper()
        full_names = upper.map(lambda name: _STATE_NAMES.get(name, name))
        df['state'] = full_names.where(state.notna())
    return df


def standardize_education(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize the 'education' column.

    Every value starting with 'B' or 'b' becomes 'Bachelor'; missing values
    stay missing.  Updates *df* and returns it.
    """
    if 'education' in df.columns:
        education = df['education']
        text = _stripped_text(education)
        is_bachelor = text.str.contains(r'^[Bb]', na=False)
        df['education'] = text.mask(is_bachelor, 'Bachelor').where(education.notna())
    return df


def standardize_vehicle_class(df: pd.DataFrame) -> pd.DataFrame:
    """Group similar 'vehicle_class' values under 'Luxury'.

    Values starting with 'Lu' and values containing the word 'Sports' become
    'Luxury'; missing values stay missing.  Updates *df* and returns it.
    """
    if 'vehicle_class' in df.columns:
        vehicle = df['vehicle_class']
        text = _stripped_text(vehicle)
        # '^Lu' (prefix), not '^[Lu]' (any value starting with 'L' or 'u').
        is_luxury = text.str.contains(r'^Lu|\bSports\b', na=False)
        df['vehicle_class'] = text.mask(is_luxury, 'Luxury').where(vehicle.notna())
    return df


def clean_and_convert_numerical(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and convert the numerical columns.

    * ``customer_lifetime_value``: a trailing '%' is removed and the column is
      converted to float64 (unparseable values become NaN).
    * ``number_of_open_complaints``: for values shaped like '1/2/00' the
      number between the slashes is taken; values that are already numeric are
      kept.  The column is returned as nullable ``Int64``; anything that is
      not a whole number becomes ``<NA>``.

    Updates *df* and returns it.
    """
    if 'customer_lifetime_value' in df.columns:
        clv_text = _stripped_text(df['customer_lifetime_value']).str.rstrip('%')
        df['customer_lifetime_value'] = pd.to_numeric(
            clv_text, errors='coerce'
        ).astype('float64')

    if 'number_of_open_complaints' in df.columns:
        text = _stripped_text(df['number_of_open_complaints'])
        between_slashes = text.str.extract(r'/(\d+)/', expand=False)
        counts = pd.to_numeric(between_slashes, errors='coerce')
        # Values without slashes (already plain numbers) would otherwise all
        # be lost, so fall back to parsing the raw text directly.
        counts = counts.fillna(pd.to_numeric(text, errors='coerce'))
        # Int64 cannot hold fractions/inf; treat them as missing, not an error.
        counts = counts.where(counts % 1 == 0)
        df['number_of_open_complaints'] = counts.astype('Int64')

    return df


def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without duplicate rows, keeping the first occurrence.

    The index of the result is reset and the number of removed rows is
    printed.
    """
    deduplicated = df.drop_duplicates(keep='first').reset_index(drop=True)
    print(f"Number of duplicate rows removed: {len(df) - len(deduplicated)}")
    return deduplicated


def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in the known categorical and numerical columns.

    Categorical columns are filled with 'Unknown' and numeric columns with
    their median.  ``number_of_open_complaints`` is not filled; a boolean
    ``complaints_missing`` column flags its missing entries instead.
    Updates *df* and returns it.
    """
    for col in _CATEGORICAL_COLUMNS:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')

    for col in _NUMERICAL_COLUMNS:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())

    if 'number_of_open_complaints' in df.columns:
        df['complaints_missing'] = df['number_of_open_complaints'].isnull()

    return df


def main_cleaning_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Run the complete cleaning and formatting pipeline on *df*.

    Returns a new, cleaned frame; the input frame is not modified.
    """
    print("Starting the data cleaning and formatting pipeline...")

    df = clean_column_names(df)
    # Drop fully empty rows before imputation, otherwise they are turned into
    # a fabricated 'Unknown' customer carrying median values.
    df = _drop_empty_rows(df)
    df = standardize_gender(df)
    df = standardize_state(df)
    df = standardize_education(df)
    df = standardize_vehicle_class(df)
    df = clean_and_convert_numerical(df)
    df = handle_missing_values(df)
    df = remove_duplicates(df)

    print("Data cleaning pipeline completed successfully.")
    return df
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting the data cleaning and formatting pipeline...\n", + "Number of duplicate rows removed: 2936\n", + "Data cleaning pipeline completed successfully.\n" + ] + } + ], + "source": [ + "df1_cleaned = data_cleaning.main_cleaning_pipeline(df1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1d4f3233-e292-45d0-b320-103611ba632a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting the data cleaning and formatting pipeline...\n", + "Number of duplicate rows removed: 0\n", + "Data cleaning pipeline completed successfully.\n" + ] + } + ], + "source": [ + "df2_cleaned = data_cleaning.main_cleaning_pipeline(df2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "92363654-4a71-4f39-ac55-22f4ec286278", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting the data cleaning and formatting pipeline...\n", + "Number of duplicate rows removed: 0\n", + "Data cleaning pipeline completed successfully.\n" + ] + } + ], + "source": [ + "df3_cleaned = data_cleaning.main_cleaning_pipeline(df3)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "40584279-b21c-4974-ae7d-3e89e738843b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valuecustomer_incomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountcomplaints_missing
0RB50392WASHINGTONnanMaster588174.2350.01000.00Personal AutoFour-Door Car2.704934False
1QZ44356ARIZONAFBachelor697953.5900.094.00Personal AutoFour-Door Car1131.464935False
2AI49188NEVADAFBachelor1288743.17048767.0108.00Personal AutoTwo-Door Car566.472247False
3WW63253CALIFORNIAMBachelor764586.1800.0106.00Corporate AutoSUV529.881344False
4GA49547WASHINGTONMHigh School or Below536307.65036357.068.00Personal AutoFour-Door Car17.269323False
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 WASHINGTON nan Master 588174.235 \n", + "1 QZ44356 ARIZONA F Bachelor 697953.590 \n", + "2 AI49188 NEVADA F Bachelor 1288743.170 \n", + "3 WW63253 CALIFORNIA M Bachelor 764586.180 \n", + "4 GA49547 WASHINGTON M High School or Below 536307.650 \n", + "\n", + " customer_income monthly_premium_auto number_of_open_complaints \\\n", + "0 0.0 1000.0 0 \n", + "1 0.0 94.0 0 \n", + "2 48767.0 108.0 0 \n", + "3 0.0 106.0 0 \n", + "4 36357.0 68.0 0 \n", + "\n", + " policy_type vehicle_class total_claim_amount complaints_missing \n", + "0 Personal Auto Four-Door Car 2.704934 False \n", + "1 Personal Auto Four-Door Car 1131.464935 False \n", + "2 Personal Auto Two-Door Car 566.472247 False \n", + "3 Corporate Auto SUV 529.881344 False \n", + "4 Personal Auto Four-Door Car 17.269323 False " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cleaned = pd.concat([df1_cleaned, df2_cleaned, df3_cleaned], ignore_index=True)\n", + "df_cleaned.head()" ] }, { @@ -72,14 +322,229 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n", + "df.head()" ] }, { @@ -103,6 +568,87 @@ "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." 
] }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6f3b98b9-4284-45bb-9838-6c95b98b6c46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10910 entries, 0 to 10909\n", + "Data columns (total 27 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 unnamed:_0 10910 non-null int64 \n", + " 1 customer 10910 non-null object \n", + " 2 state 10910 non-null object \n", + " 3 customer_lifetime_value 10910 non-null float64\n", + " 4 response 10910 non-null object \n", + " 5 coverage 10910 non-null object \n", + " 6 education 10910 non-null object \n", + " 7 effective_to_date 10910 non-null object \n", + " 8 employmentstatus 10910 non-null object \n", + " 9 gender 10910 non-null object \n", + " 10 income 10910 non-null int64 \n", + " 11 location_code 10910 non-null object \n", + " 12 marital_status 10910 non-null object \n", + " 13 monthly_premium_auto 10910 non-null int64 \n", + " 14 months_since_last_claim 10910 non-null float64\n", + " 15 months_since_policy_inception 10910 non-null int64 \n", + " 16 number_of_open_complaints 10910 non-null float64\n", + " 17 number_of_policies 10910 non-null int64 \n", + " 18 policy_type 10910 non-null object \n", + " 19 policy 10910 non-null object \n", + " 20 renew_offer_type 10910 non-null object \n", + " 21 sales_channel 10910 non-null object \n", + " 22 total_claim_amount 10910 non-null float64\n", + " 23 vehicle_class 10910 non-null object \n", + " 24 vehicle_size 10910 non-null object \n", + " 25 vehicle_type 10910 non-null object \n", + " 26 month 10910 non-null int64 \n", + "dtypes: float64(4), int64(6), object(17)\n", + "memory usage: 2.2+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f8beddc0-36fc-47ad-ab92-dacc6e41f695", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 
total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04\n" + ] + } + ], + "source": [ + "# Summary table showing the total revenue for each sales channel\n", + "pivot_revenue = df.pivot_table(\n", + " values='total_claim_amount',\n", + " index='sales_channel',\n", + " aggfunc='sum'\n", + ").round(2)\n", + "\n", + "print(pivot_revenue)" + ] + }, { "cell_type": "markdown", "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", @@ -130,15 +676,297 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typenumber_of_open_complaintsmonth
0Corporate Auto0.0000002
1Personal Auto0.0000001
2Personal Auto0.0000002
3Corporate Auto0.0000001
4Personal Auto0.3842561
\n", + "
" + ], + "text/plain": [ + " policy_type number_of_open_complaints month\n", + "0 Corporate Auto 0.000000 2\n", + "1 Personal Auto 0.000000 1\n", + "2 Personal Auto 0.000000 2\n", + "3 Corporate Auto 0.000000 1\n", + "4 Personal Auto 0.384256 1" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['policy_type', 'number_of_open_complaints', 'month']].head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "2a125542-e6f6-4be5-bae4-941ec3f9af7e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "month\n", + "1 5818\n", + "2 5092\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.month.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7558edbf-7d65-47d3-a17a-43528257af6b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "number_of_open_complaints\n", + "0.000000 8160\n", + "1.000000 1145\n", + "0.384256 633\n", + "2.000000 414\n", + "3.000000 324\n", + "4.000000 166\n", + "5.000000 68\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.number_of_open_complaints.value_counts().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "b768d078-f660-4e68-8f25-dfe69990eb1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0. , 0.38425611, 3. , 1. , 2. ,\n", + " 4. , 5. 
])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['number_of_open_complaints'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "b4722727-e0bd-4f45-a816-def7f4898a3d", + "metadata": {}, "outputs": [], "source": [ - "# Your code goes here" + "\n", + "df['number_of_open_complaints'] = df['number_of_open_complaints'].astype(str).str.extract(r'^(\\d)').astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "fef4dd7a-2df7-4294-ac98-4def3efb9775", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "number_of_open_complaints\n", + "0.0 8793\n", + "1.0 1145\n", + "2.0 414\n", + "3.0 324\n", + "4.0 166\n", + "5.0 68\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['number_of_open_complaints'].value_counts().sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "0df38b71-2ac3-48c9-8cc9-60107b2e7cbf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typenumber_of_open_complaintsmonth
0Corporate Auto0.02
1Personal Auto0.01
2Personal Auto0.02
3Corporate Auto0.01
4Personal Auto0.01
\n", + "
" + ], + "text/plain": [ + " policy_type number_of_open_complaints month\n", + "0 Corporate Auto 0.0 2\n", + "1 Personal Auto 0.0 1\n", + "2 Personal Auto 0.0 2\n", + "3 Corporate Auto 0.0 1\n", + "4 Personal Auto 0.0 1" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['policy_type', 'number_of_open_complaints', 'month']].head()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77303f92-45bc-4f46-9e0f-6e9eb5c0372b", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -146,9 +974,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -160,7 +988,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4, From 40a65dd52969ee61515d5971178ac7ccbd13c3ae Mon Sep 17 00:00:00 2001 From: Antonio Gouveia Date: Wed, 24 Sep 2025 19:48:29 +0100 Subject: [PATCH 2/2] correction of solved lab --- lab-dw-data-structuring-and-combining.ipynb | 248 ++++++++++++++++++-- 1 file changed, 223 insertions(+), 25 deletions(-) diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index 66876b3..5dd7049 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "60bfb74e-d199-4f19-b554-9dd27f852d83", "metadata": {}, "outputs": [], @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "c3ab77d2-1e35-47f8-9274-b789cc92968c", "metadata": {}, "outputs": [], @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": 
"29099c40-9ca7-4c8b-b3b1-eb9558882156", "metadata": {}, "outputs": [ @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "id": "1d4f3233-e292-45d0-b320-103611ba632a", "metadata": {}, "outputs": [ @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "92363654-4a71-4f39-ac55-22f4ec286278", "metadata": {}, "outputs": [ @@ -144,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "40584279-b21c-4974-ae7d-3e89e738843b", "metadata": {}, "outputs": [ @@ -286,7 +286,7 @@ "4 Personal Auto Four-Door Car 17.269323 False " ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -322,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" @@ -537,7 +537,7 @@ "[5 rows x 27 columns]" ] }, - "execution_count": 14, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -570,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "id": "6f3b98b9-4284-45bb-9838-6c95b98b6c46", "metadata": {}, "outputs": [ @@ -621,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "id": "f8beddc0-36fc-47ad-ab92-dacc6e41f695", "metadata": {}, "outputs": [ @@ -649,6 +649,92 @@ "print(pivot_revenue)" ] }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9006dfc7-e010-43ef-8bf6-9c219e12ddb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationBachelorCollegeDoctorHigh School or BelowMaster
gender
F7874.277748.827328.518675.228157.05
M7703.608052.467415.338149.698168.83
\n", + "
" + ], + "text/plain": [ + "education Bachelor College Doctor High School or Below Master\n", + "gender \n", + "F 7874.27 7748.82 7328.51 8675.22 8157.05\n", + "M 7703.60 8052.46 7415.33 8149.69 8168.83" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_clv = df.pivot_table(\n", + " values='customer_lifetime_value',\n", + " index='gender',\n", + " columns='education',\n", + " aggfunc='mean'\n", + ").round(2)\n", + "\n", + "pivot_clv" + ] + }, { "cell_type": "markdown", "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", @@ -676,7 +762,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" @@ -752,7 +838,7 @@ "4 Personal Auto 0.384256 1" ] }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -763,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 16, "id": "2a125542-e6f6-4be5-bae4-941ec3f9af7e", "metadata": {}, "outputs": [ @@ -776,7 +862,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 22, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -787,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "id": "7558edbf-7d65-47d3-a17a-43528257af6b", "metadata": {}, "outputs": [ @@ -805,7 +891,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -816,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "id": "b768d078-f660-4e68-8f25-dfe69990eb1a", "metadata": {}, "outputs": [ @@ -827,7 +913,7 @@ " 4. , 5. 
])" ] }, - "execution_count": 26, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -838,7 +924,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 19, "id": "b4722727-e0bd-4f45-a816-def7f4898a3d", "metadata": {}, "outputs": [], @@ -849,7 +935,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 20, "id": "fef4dd7a-2df7-4294-ac98-4def3efb9775", "metadata": {}, "outputs": [ @@ -866,7 +952,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 29, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -877,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 21, "id": "0df38b71-2ac3-48c9-8cc9-60107b2e7cbf", "metadata": {}, "outputs": [ @@ -951,7 +1037,7 @@ "4 Personal Auto 0.0 1" ] }, - "execution_count": 30, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -962,10 +1048,122 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "77303f92-45bc-4f46-9e0f-6e9eb5c0372b", "metadata": {}, "outputs": [], + "source": [ + "df_complaints = df[df['number_of_open_complaints'].notna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "f3617030-52d8-43f8-807d-7a8bce40c8c9", + "metadata": {}, + "outputs": [], + "source": [ + "complaints_summary = df_complaints.groupby(['policy_type', 'month']).size().reset_index(name='Num Complaints')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a2bc7621-a1b1-4c86-ba78-a552073de300", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typemonthNum Complaints
2Personal Auto14329
3Personal Auto23799
0Corporate Auto11252
1Corporate Auto21089
4Special Auto1237
5Special Auto2204
\n", + "
" + ], + "text/plain": [ + " policy_type month Num Complaints\n", + "2 Personal Auto 1 4329\n", + "3 Personal Auto 2 3799\n", + "0 Corporate Auto 1 1252\n", + "1 Corporate Auto 2 1089\n", + "4 Special Auto 1 237\n", + "5 Special Auto 2 204" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complaints_summary.sort_values(by='Num Complaints', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92964ac2-32a8-4d29-92a9-7df90ba270ce", + "metadata": {}, + "outputs": [], "source": [] } ],