From a8a051960c5ba10ea5092d4ea3d7fc8d509e0c6f Mon Sep 17 00:00:00 2001
From: Antonio Gouveia <toze.sgouveia@gmail.com>
Date: Wed, 24 Sep 2025 19:38:21 +0100
Subject: [PATCH 1/2] solved lab

---
 data_cleaning.py                            | 127 +++
 lab-dw-data-structuring-and-combining.ipynb | 848 +++++++++++++++++++-
 2 files changed, 965 insertions(+), 10 deletions(-)
 create mode 100644 data_cleaning.py

diff --git a/data_cleaning.py b/data_cleaning.py
new file mode 100644
index 0000000..0f9a4e3
--- /dev/null
+++ b/data_cleaning.py
@@ -0,0 +1,127 @@
+# data_cleaning.py
+
+import pandas as pd
+import numpy as np
+
+def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Returns a copy of df with lower-cased column names, spaces replaced by underscores,
+    and 'st' / 'income' renamed to 'state' / 'customer_income'. The input is not modified.
+    """
+    # rename() builds a new frame; assigning to df.columns relabelled the caller's DataFrame.
+    renamed = df.rename(columns=lambda name: str(name).lower().replace(' ', '_'))
+    return renamed.rename(columns={'st': 'state', 'income': 'customer_income'})
+
+def standardize_gender(df):
+    """Maps 'gender' values starting with F/M (any case) to 'F'/'M'; no-op without that column."""
+    if 'gender' in df.columns:
+        def _normalize(value):
+            # Leave missing values as they are: astype(str) used to turn NaN into the text
+            # 'nan', which a later fillna() can no longer detect.
+            if pd.isna(value):
+                return value
+            text = str(value).strip().upper()
+            return 'F' if text.startswith('F') else 'M' if text.startswith('M') else str(value)
+        df['gender'] = df['gender'].apply(_normalize)
+    return df
+
+
+def standardize_state(df: pd.DataFrame) -> pd.DataFrame:
+    """Upper-cases 'state' and expands abbreviations (plus 'CALI') to full names; NaN stays NaN."""
+    if 'state' in df.columns:
+        state_mapping = {
+            'AL': 'ALABAMA', 'AK': 'ALASKA', 'AZ': 'ARIZONA', 'AR': 'ARKANSAS', 'CA': 'CALIFORNIA', 'CALI': 'CALIFORNIA',
+            'CO': 'COLORADO', 'CT': 'CONNECTICUT', 'DE': 'DELAWARE', 'FL': 'FLORIDA', 'GA': 'GEORGIA',
+            'HI': 'HAWAII', 'ID': 'IDAHO', 'IL': 'ILLINOIS', 'IN': 'INDIANA', 'IA': 'IOWA',
+            'KS': 'KANSAS', 'KY': 'KENTUCKY', 'LA': 'LOUISIANA', 'ME': 'MAINE', 'MD': 'MARYLAND',
+            'MA': 'MASSACHUSETTS', 'MI': 'MICHIGAN', 'MN': 'MINNESOTA', 'MS': 'MISSISSIPPI', 'MO': 'MISSOURI',
+            'MT': 'MONTANA', 'NE': 'NEBRASKA', 'NV': 'NEVADA', 'NH': 'NEW HAMPSHIRE', 'NJ': 'NEW JERSEY',
+            'NM': 'NEW MEXICO', 'NY': 'NEW YORK', 'NC': 'NORTH CAROLINA', 'ND': 'NORTH DAKOTA', 'OH': 'OHIO',
+            'OK': 'OKLAHOMA', 'OR': 'OREGON', 'PA': 'PENNSYLVANIA', 'RI': 'RHODE ISLAND', 'SC': 'SOUTH CAROLINA',
+            'SD': 'SOUTH DAKOTA', 'TN': 'TENNESSEE', 'TX': 'TEXAS', 'UT': 'UTAH', 'VT': 'VERMONT',
+            'VA': 'VIRGINIA', 'WA': 'WASHINGTON', 'WV': 'WEST VIRGINIA', 'WI': 'WISCONSIN', 'WY': 'WYOMING'
+        }
+        # Restore real missing values afterwards; astype(str) turned NaN into the text 'NAN'.
+        upper = df['state'].astype(str).str.upper().replace(state_mapping)
+        df['state'] = upper.where(df['state'].notna(), df['state'])
+    return df
+
+def standardize_education(df: pd.DataFrame) -> pd.DataFrame:
+    """Relabels 'education' values starting with 'B' or 'b' as 'Bachelor'; missing values stay missing."""
+    if 'education' in df.columns:
+        # Work on a text copy, then restore real NaN (astype(str) turns it into the text 'nan').
+        text = df['education'].astype(str)
+        text = text.mask(text.str.contains(r'^[Bb]', na=False), 'Bachelor')
+        df['education'] = text.where(df['education'].notna(), df['education'])
+    return df
+
+def standardize_vehicle_class(df: pd.DataFrame) -> pd.DataFrame:
+    """Relabels 'vehicle_class' values that start with 'Lu' or contain the word 'Sports' as 'Luxury'."""
+    if 'vehicle_class' in df.columns:
+        text = df['vehicle_class'].astype(str)
+        # '^Lu' instead of '^[Lu]': the character class matched any value starting with 'L' or 'u'.
+        text = text.mask(text.str.contains(r'^Lu|\bSports\b', na=False), 'Luxury')
+        # Restore real missing values; astype(str) turned them into the text 'nan'.
+        df['vehicle_class'] = text.where(df['vehicle_class'].notna(), df['vehicle_class'])
+    return df
+
+def clean_and_convert_numerical(df: pd.DataFrame) -> pd.DataFrame:
+    """Converts 'customer_lifetime_value' (trailing '%' removed) to float64 and complaints to rounded Int64."""
+    if 'customer_lifetime_value' in df.columns:
+        clv = df['customer_lifetime_value'].astype(str).str.rstrip('%')
+        df['customer_lifetime_value'] = pd.to_numeric(clv, errors='coerce').astype('float64')
+
+    if 'number_of_open_complaints' in df.columns:
+        raw = df['number_of_open_complaints'].astype(str)
+        # '1/5/00' carries the count between the slashes; values without slashes (already
+        # numeric input) used to become NA, so fall back to converting them directly.
+        count = raw.str.extract(r'/(\d+)/', expand=False).fillna(raw)
+        df['number_of_open_complaints'] = pd.to_numeric(count, errors='coerce').round().astype('Int64')
+    return df
+
+def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Drops repeated rows (the first occurrence wins), renumbers the index from 0,
+    and prints how many rows were dropped.
+    """
+    deduped = df[~df.duplicated(keep='first')].reset_index(drop=True)
+    dropped_count = len(df) - len(deduped)
+    print(f"Number of duplicate rows removed: {dropped_count}")
+    return deduped
+
+def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Imputes missing data: 'Unknown' for the listed text columns, the column median
+    for the listed numeric columns, and adds a 'complaints_missing' flag column.
+    """
+    text_columns = ('customer', 'state', 'gender', 'education', 'policy_type', 'vehicle_class')
+    for name in (c for c in text_columns if c in df.columns):
+        df[name] = df[name].fillna('Unknown')
+    numeric_columns = ('total_claim_amount', 'monthly_premium_auto', 'customer_income', 'customer_lifetime_value')
+    for name in numeric_columns:
+        # Columns that are absent or not numeric are left untouched.
+        if name not in df.columns or not pd.api.types.is_numeric_dtype(df[name]):
+            continue
+        df[name] = df[name].fillna(df[name].median())
+
+    if 'number_of_open_complaints' in df.columns:
+        df['complaints_missing'] = df['number_of_open_complaints'].isna()
+    return df
+
+def main_cleaning_pipeline(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Runs every cleaning step in order on df and returns the cleaned DataFrame.
+    Rows with no data at all are dropped first; progress is reported via print().
+    """
+    print("Starting the data cleaning and formatting pipeline...")
+    # Drop fully empty rows up front: otherwise they are imputed into fabricated records
+    # that only collapse to a single fake row when duplicates are removed.
+    df = df.dropna(how='all')
+    steps = (clean_column_names, standardize_gender, standardize_state, standardize_education,
+             standardize_vehicle_class, clean_and_convert_numerical, handle_missing_values,
+             remove_duplicates)
+    for step in steps:
+        df = step(df)
+
+    print("Data cleaning pipeline completed successfully.")
+    return df
\ No newline at end of file
diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
index ec4e3f9..66876b3 100644
--- a/lab-dw-data-structuring-and-combining.ipynb
+++ b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,264 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "492d06e3-92c7-4105-ac72-536db98d3244",
    "metadata": {
     "id": "492d06e3-92c7-4105-ac72-536db98d3244"
    },
    "outputs": [],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "88ca076a-1c83-4566-bd61-e9dadfa09d6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n",
+    "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n",
+    "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "60bfb74e-d199-4f19-b554-9dd27f852d83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Enable autoreload\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c3ab77d2-1e35-47f8-9274-b789cc92968c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#  Import cleaning module\n",
+    "import data_cleaning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "29099c40-9ca7-4c8b-b3b1-eb9558882156",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Starting the data cleaning and formatting pipeline...\n",
+      "Number of duplicate rows removed: 2936\n",
+      "Data cleaning pipeline completed successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "df1_cleaned = data_cleaning.main_cleaning_pipeline(df1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "1d4f3233-e292-45d0-b320-103611ba632a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Starting the data cleaning and formatting pipeline...\n",
+      "Number of duplicate rows removed: 0\n",
+      "Data cleaning pipeline completed successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "df2_cleaned = data_cleaning.main_cleaning_pipeline(df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "92363654-4a71-4f39-ac55-22f4ec286278",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Starting the data cleaning and formatting pipeline...\n",
+      "Number of duplicate rows removed: 0\n",
+      "Data cleaning pipeline completed successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "df3_cleaned = data_cleaning.main_cleaning_pipeline(df3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "40584279-b21c-4974-ae7d-3e89e738843b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>customer</th>\n",
+       "      <th>state</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>education</th>\n",
+       "      <th>customer_lifetime_value</th>\n",
+       "      <th>customer_income</th>\n",
+       "      <th>monthly_premium_auto</th>\n",
+       "      <th>number_of_open_complaints</th>\n",
+       "      <th>policy_type</th>\n",
+       "      <th>vehicle_class</th>\n",
+       "      <th>total_claim_amount</th>\n",
+       "      <th>complaints_missing</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>RB50392</td>\n",
+       "      <td>WASHINGTON</td>\n",
+       "      <td>nan</td>\n",
+       "      <td>Master</td>\n",
+       "      <td>588174.235</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1000.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>2.704934</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>QZ44356</td>\n",
+       "      <td>ARIZONA</td>\n",
+       "      <td>F</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>697953.590</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>94.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>1131.464935</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>AI49188</td>\n",
+       "      <td>NEVADA</td>\n",
+       "      <td>F</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>1288743.170</td>\n",
+       "      <td>48767.0</td>\n",
+       "      <td>108.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Two-Door Car</td>\n",
+       "      <td>566.472247</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>WW63253</td>\n",
+       "      <td>CALIFORNIA</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>764586.180</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>106.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>SUV</td>\n",
+       "      <td>529.881344</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GA49547</td>\n",
+       "      <td>WASHINGTON</td>\n",
+       "      <td>M</td>\n",
+       "      <td>High School or Below</td>\n",
+       "      <td>536307.650</td>\n",
+       "      <td>36357.0</td>\n",
+       "      <td>68.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>17.269323</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  customer       state gender             education  customer_lifetime_value  \\\n",
+       "0  RB50392  WASHINGTON    nan                Master               588174.235   \n",
+       "1  QZ44356     ARIZONA      F              Bachelor               697953.590   \n",
+       "2  AI49188      NEVADA      F              Bachelor              1288743.170   \n",
+       "3  WW63253  CALIFORNIA      M              Bachelor               764586.180   \n",
+       "4  GA49547  WASHINGTON      M  High School or Below               536307.650   \n",
+       "\n",
+       "   customer_income  monthly_premium_auto  number_of_open_complaints  \\\n",
+       "0              0.0                1000.0                          0   \n",
+       "1              0.0                  94.0                          0   \n",
+       "2          48767.0                 108.0                          0   \n",
+       "3              0.0                 106.0                          0   \n",
+       "4          36357.0                  68.0                          0   \n",
+       "\n",
+       "      policy_type  vehicle_class  total_claim_amount  complaints_missing  \n",
+       "0   Personal Auto  Four-Door Car            2.704934               False  \n",
+       "1   Personal Auto  Four-Door Car         1131.464935               False  \n",
+       "2   Personal Auto   Two-Door Car          566.472247               False  \n",
+       "3  Corporate Auto            SUV          529.881344               False  \n",
+       "4   Personal Auto  Four-Door Car           17.269323               False  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_cleaned = pd.concat([df1_cleaned, df2_cleaned, df3_cleaned], ignore_index=True)\n",
+    "df_cleaned.head()"
    ]
   },
   {
@@ -72,14 +322,229 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
    "metadata": {
     "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>unnamed:_0</th>\n",
+       "      <th>customer</th>\n",
+       "      <th>state</th>\n",
+       "      <th>customer_lifetime_value</th>\n",
+       "      <th>response</th>\n",
+       "      <th>coverage</th>\n",
+       "      <th>education</th>\n",
+       "      <th>effective_to_date</th>\n",
+       "      <th>employmentstatus</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>...</th>\n",
+       "      <th>number_of_policies</th>\n",
+       "      <th>policy_type</th>\n",
+       "      <th>policy</th>\n",
+       "      <th>renew_offer_type</th>\n",
+       "      <th>sales_channel</th>\n",
+       "      <th>total_claim_amount</th>\n",
+       "      <th>vehicle_class</th>\n",
+       "      <th>vehicle_size</th>\n",
+       "      <th>vehicle_type</th>\n",
+       "      <th>month</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>DK49336</td>\n",
+       "      <td>Arizona</td>\n",
+       "      <td>4809.216960</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Basic</td>\n",
+       "      <td>College</td>\n",
+       "      <td>2011-02-18</td>\n",
+       "      <td>Employed</td>\n",
+       "      <td>M</td>\n",
+       "      <td>...</td>\n",
+       "      <td>9</td>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>Corporate L3</td>\n",
+       "      <td>Offer3</td>\n",
+       "      <td>Agent</td>\n",
+       "      <td>292.800000</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>KX64629</td>\n",
+       "      <td>California</td>\n",
+       "      <td>2228.525238</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Basic</td>\n",
+       "      <td>College</td>\n",
+       "      <td>2011-01-18</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>F</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Personal L3</td>\n",
+       "      <td>Offer4</td>\n",
+       "      <td>Call Center</td>\n",
+       "      <td>744.924331</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>LZ68649</td>\n",
+       "      <td>Washington</td>\n",
+       "      <td>14947.917300</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Basic</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>2011-02-10</td>\n",
+       "      <td>Employed</td>\n",
+       "      <td>M</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Personal L3</td>\n",
+       "      <td>Offer3</td>\n",
+       "      <td>Call Center</td>\n",
+       "      <td>480.000000</td>\n",
+       "      <td>SUV</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>XL78013</td>\n",
+       "      <td>Oregon</td>\n",
+       "      <td>22332.439460</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Extended</td>\n",
+       "      <td>College</td>\n",
+       "      <td>2011-01-11</td>\n",
+       "      <td>Employed</td>\n",
+       "      <td>M</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>Corporate L3</td>\n",
+       "      <td>Offer2</td>\n",
+       "      <td>Branch</td>\n",
+       "      <td>484.013411</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>QA50777</td>\n",
+       "      <td>Oregon</td>\n",
+       "      <td>9025.067525</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Premium</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>2011-01-17</td>\n",
+       "      <td>Medical Leave</td>\n",
+       "      <td>F</td>\n",
+       "      <td>...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Personal L2</td>\n",
+       "      <td>Offer1</td>\n",
+       "      <td>Branch</td>\n",
+       "      <td>707.925645</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>Medsize</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   unnamed:_0 customer       state  customer_lifetime_value response  \\\n",
+       "0           0  DK49336     Arizona              4809.216960       No   \n",
+       "1           1  KX64629  California              2228.525238       No   \n",
+       "2           2  LZ68649  Washington             14947.917300       No   \n",
+       "3           3  XL78013      Oregon             22332.439460      Yes   \n",
+       "4           4  QA50777      Oregon              9025.067525       No   \n",
+       "\n",
+       "   coverage education effective_to_date employmentstatus gender  ...  \\\n",
+       "0     Basic   College        2011-02-18         Employed      M  ...   \n",
+       "1     Basic   College        2011-01-18       Unemployed      F  ...   \n",
+       "2     Basic  Bachelor        2011-02-10         Employed      M  ...   \n",
+       "3  Extended   College        2011-01-11         Employed      M  ...   \n",
+       "4   Premium  Bachelor        2011-01-17    Medical Leave      F  ...   \n",
+       "\n",
+       "   number_of_policies     policy_type        policy  renew_offer_type  \\\n",
+       "0                   9  Corporate Auto  Corporate L3            Offer3   \n",
+       "1                   1   Personal Auto   Personal L3            Offer4   \n",
+       "2                   2   Personal Auto   Personal L3            Offer3   \n",
+       "3                   2  Corporate Auto  Corporate L3            Offer2   \n",
+       "4                   7   Personal Auto   Personal L2            Offer1   \n",
+       "\n",
+       "   sales_channel  total_claim_amount  vehicle_class  vehicle_size  \\\n",
+       "0          Agent          292.800000  Four-Door Car       Medsize   \n",
+       "1    Call Center          744.924331  Four-Door Car       Medsize   \n",
+       "2    Call Center          480.000000            SUV       Medsize   \n",
+       "3         Branch          484.013411  Four-Door Car       Medsize   \n",
+       "4         Branch          707.925645  Four-Door Car       Medsize   \n",
+       "\n",
+       "  vehicle_type month  \n",
+       "0            A     2  \n",
+       "1            A     1  \n",
+       "2            A     2  \n",
+       "3            A     1  \n",
+       "4            A     1  \n",
+       "\n",
+       "[5 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n",
+    "df.head()"
    ]
   },
   {
@@ -103,6 +568,87 @@
     "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6f3b98b9-4284-45bb-9838-6c95b98b6c46",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10910 entries, 0 to 10909\n",
+      "Data columns (total 27 columns):\n",
+      " #   Column                         Non-Null Count  Dtype  \n",
+      "---  ------                         --------------  -----  \n",
+      " 0   unnamed:_0                     10910 non-null  int64  \n",
+      " 1   customer                       10910 non-null  object \n",
+      " 2   state                          10910 non-null  object \n",
+      " 3   customer_lifetime_value        10910 non-null  float64\n",
+      " 4   response                       10910 non-null  object \n",
+      " 5   coverage                       10910 non-null  object \n",
+      " 6   education                      10910 non-null  object \n",
+      " 7   effective_to_date              10910 non-null  object \n",
+      " 8   employmentstatus               10910 non-null  object \n",
+      " 9   gender                         10910 non-null  object \n",
+      " 10  income                         10910 non-null  int64  \n",
+      " 11  location_code                  10910 non-null  object \n",
+      " 12  marital_status                 10910 non-null  object \n",
+      " 13  monthly_premium_auto           10910 non-null  int64  \n",
+      " 14  months_since_last_claim        10910 non-null  float64\n",
+      " 15  months_since_policy_inception  10910 non-null  int64  \n",
+      " 16  number_of_open_complaints      10910 non-null  float64\n",
+      " 17  number_of_policies             10910 non-null  int64  \n",
+      " 18  policy_type                    10910 non-null  object \n",
+      " 19  policy                         10910 non-null  object \n",
+      " 20  renew_offer_type               10910 non-null  object \n",
+      " 21  sales_channel                  10910 non-null  object \n",
+      " 22  total_claim_amount             10910 non-null  float64\n",
+      " 23  vehicle_class                  10910 non-null  object \n",
+      " 24  vehicle_size                   10910 non-null  object \n",
+      " 25  vehicle_type                   10910 non-null  object \n",
+      " 26  month                          10910 non-null  int64  \n",
+      "dtypes: float64(4), int64(6), object(17)\n",
+      "memory usage: 2.2+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f8beddc0-36fc-47ad-ab92-dacc6e41f695",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "               total_claim_amount\n",
+      "sales_channel                    \n",
+      "Agent                  1810226.82\n",
+      "Branch                 1301204.00\n",
+      "Call Center             926600.82\n",
+      "Web                     706600.04\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Summary table showing the total revenue for each sales channel\n",
+    "pivot_revenue = df.pivot_table(\n",
+    "    values='total_claim_amount',\n",
+    "    index='sales_channel',\n",
+    "    aggfunc='sum'\n",
+    ").round(2)\n",
+    "\n",
+    "print(pivot_revenue)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32c7f2e5-3d90-43e5-be33-9781b6069198",
@@ -130,15 +676,297 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "3a069e0b-b400-470e-904d-d17582191be4",
    "metadata": {
     "id": "3a069e0b-b400-470e-904d-d17582191be4"
    },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>policy_type</th>\n",
+       "      <th>number_of_open_complaints</th>\n",
+       "      <th>month</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>0.384256</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      policy_type  number_of_open_complaints  month\n",
+       "0  Corporate Auto                   0.000000      2\n",
+       "1   Personal Auto                   0.000000      1\n",
+       "2   Personal Auto                   0.000000      2\n",
+       "3  Corporate Auto                   0.000000      1\n",
+       "4   Personal Auto                   0.384256      1"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[['policy_type', 'number_of_open_complaints', 'month']].head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "2a125542-e6f6-4be5-bae4-941ec3f9af7e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "month\n",
+       "1    5818\n",
+       "2    5092\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.month.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "7558edbf-7d65-47d3-a17a-43528257af6b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "number_of_open_complaints\n",
+       "0.000000    8160\n",
+       "1.000000    1145\n",
+       "0.384256     633\n",
+       "2.000000     414\n",
+       "3.000000     324\n",
+       "4.000000     166\n",
+       "5.000000      68\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.number_of_open_complaints.value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "b768d078-f660-4e68-8f25-dfe69990eb1a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0.        , 0.38425611, 3.        , 1.        , 2.        ,\n",
+       "       4.        , 5.        ])"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['number_of_open_complaints'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "b4722727-e0bd-4f45-a816-def7f4898a3d",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code goes here"
+    "\n",
+    "df['number_of_open_complaints'] = df['number_of_open_complaints'].astype(str).str.extract(r'^(\\d)').astype(float)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "fef4dd7a-2df7-4294-ac98-4def3efb9775",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "number_of_open_complaints\n",
+       "0.0    8793\n",
+       "1.0    1145\n",
+       "2.0     414\n",
+       "3.0     324\n",
+       "4.0     166\n",
+       "5.0      68\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['number_of_open_complaints'].value_counts().sort_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "0df38b71-2ac3-48c9-8cc9-60107b2e7cbf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>policy_type</th>\n",
+       "      <th>number_of_open_complaints</th>\n",
+       "      <th>month</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      policy_type  number_of_open_complaints  month\n",
+       "0  Corporate Auto                        0.0      2\n",
+       "1   Personal Auto                        0.0      1\n",
+       "2   Personal Auto                        0.0      2\n",
+       "3  Corporate Auto                        0.0      1\n",
+       "4   Personal Auto                        0.0      1"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[['policy_type', 'number_of_open_complaints', 'month']].head()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77303f92-45bc-4f46-9e0f-6e9eb5c0372b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -146,9 +974,9 @@
    "provenance": []
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python [conda env:base] *",
    "language": "python",
-   "name": "python3"
+   "name": "conda-base-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -160,7 +988,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,

From 40a65dd52969ee61515d5971178ac7ccbd13c3ae Mon Sep 17 00:00:00 2001
From: Antonio Gouveia <toze.sgouveia@gmail.com>
Date: Wed, 24 Sep 2025 19:48:29 +0100
Subject: [PATCH 2/2] correction of solved lab

---
 lab-dw-data-structuring-and-combining.ipynb | 248 ++++++++++++++++++--
 1 file changed, 223 insertions(+), 25 deletions(-)

diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
index 66876b3..5dd7049 100644
--- a/lab-dw-data-structuring-and-combining.ipynb
+++ b/lab-dw-data-structuring-and-combining.ipynb
@@ -61,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "id": "60bfb74e-d199-4f19-b554-9dd27f852d83",
    "metadata": {},
    "outputs": [],
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "c3ab77d2-1e35-47f8-9274-b789cc92968c",
    "metadata": {},
    "outputs": [],
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "29099c40-9ca7-4c8b-b3b1-eb9558882156",
    "metadata": {},
    "outputs": [
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
    "id": "1d4f3233-e292-45d0-b320-103611ba632a",
    "metadata": {},
    "outputs": [
@@ -124,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 7,
    "id": "92363654-4a71-4f39-ac55-22f4ec286278",
    "metadata": {},
    "outputs": [
@@ -144,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 8,
    "id": "40584279-b21c-4974-ae7d-3e89e738843b",
    "metadata": {},
    "outputs": [
@@ -286,7 +286,7 @@
        "4   Personal Auto  Four-Door Car           17.269323               False  "
       ]
      },
-     "execution_count": 13,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -322,7 +322,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 9,
    "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
    "metadata": {
     "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
@@ -537,7 +537,7 @@
        "[5 rows x 27 columns]"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -570,7 +570,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "id": "6f3b98b9-4284-45bb-9838-6c95b98b6c46",
    "metadata": {},
    "outputs": [
@@ -621,7 +621,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 11,
    "id": "f8beddc0-36fc-47ad-ab92-dacc6e41f695",
    "metadata": {},
    "outputs": [
@@ -649,6 +649,92 @@
     "print(pivot_revenue)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "9006dfc7-e010-43ef-8bf6-9c219e12ddb7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>education</th>\n",
+       "      <th>Bachelor</th>\n",
+       "      <th>College</th>\n",
+       "      <th>Doctor</th>\n",
+       "      <th>High School or Below</th>\n",
+       "      <th>Master</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gender</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>F</th>\n",
+       "      <td>7874.27</td>\n",
+       "      <td>7748.82</td>\n",
+       "      <td>7328.51</td>\n",
+       "      <td>8675.22</td>\n",
+       "      <td>8157.05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>M</th>\n",
+       "      <td>7703.60</td>\n",
+       "      <td>8052.46</td>\n",
+       "      <td>7415.33</td>\n",
+       "      <td>8149.69</td>\n",
+       "      <td>8168.83</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "education  Bachelor  College   Doctor  High School or Below   Master\n",
+       "gender                                                              \n",
+       "F           7874.27  7748.82  7328.51               8675.22  8157.05\n",
+       "M           7703.60  8052.46  7415.33               8149.69  8168.83"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pivot_clv = df.pivot_table(\n",
+    "    values='customer_lifetime_value',\n",
+    "    index='gender',\n",
+    "    columns='education',\n",
+    "    aggfunc='mean'\n",
+    ").round(2)\n",
+    "\n",
+    "pivot_clv"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32c7f2e5-3d90-43e5-be33-9781b6069198",
@@ -676,7 +762,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 15,
    "id": "3a069e0b-b400-470e-904d-d17582191be4",
    "metadata": {
     "id": "3a069e0b-b400-470e-904d-d17582191be4"
@@ -752,7 +838,7 @@
        "4   Personal Auto                   0.384256      1"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -763,7 +849,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 16,
    "id": "2a125542-e6f6-4be5-bae4-941ec3f9af7e",
    "metadata": {},
    "outputs": [
@@ -776,7 +862,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -787,7 +873,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 17,
    "id": "7558edbf-7d65-47d3-a17a-43528257af6b",
    "metadata": {},
    "outputs": [
@@ -805,7 +891,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -816,7 +902,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 18,
    "id": "b768d078-f660-4e68-8f25-dfe69990eb1a",
    "metadata": {},
    "outputs": [
@@ -827,7 +913,7 @@
        "       4.        , 5.        ])"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -838,7 +924,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 19,
    "id": "b4722727-e0bd-4f45-a816-def7f4898a3d",
    "metadata": {},
    "outputs": [],
@@ -849,7 +935,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 20,
    "id": "fef4dd7a-2df7-4294-ac98-4def3efb9775",
    "metadata": {},
    "outputs": [
@@ -866,7 +952,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -877,7 +963,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 21,
    "id": "0df38b71-2ac3-48c9-8cc9-60107b2e7cbf",
    "metadata": {},
    "outputs": [
@@ -951,7 +1037,7 @@
        "4   Personal Auto                        0.0      1"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -962,10 +1048,122 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "id": "77303f92-45bc-4f46-9e0f-6e9eb5c0372b",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "df_complaints = df[df['number_of_open_complaints'].notna()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "f3617030-52d8-43f8-807d-7a8bce40c8c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "complaints_summary = df_complaints.groupby(['policy_type', 'month']).size().reset_index(name='Num Complaints')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "a2bc7621-a1b1-4c86-ba78-a552073de300",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>policy_type</th>\n",
+       "      <th>month</th>\n",
+       "      <th>Num Complaints</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4329</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3799</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1252</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1089</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Special Auto</td>\n",
+       "      <td>1</td>\n",
+       "      <td>237</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Special Auto</td>\n",
+       "      <td>2</td>\n",
+       "      <td>204</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      policy_type  month  Num Complaints\n",
+       "2   Personal Auto      1            4329\n",
+       "3   Personal Auto      2            3799\n",
+       "0  Corporate Auto      1            1252\n",
+       "1  Corporate Auto      2            1089\n",
+       "4    Special Auto      1             237\n",
+       "5    Special Auto      2             204"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "complaints_summary.sort_values(by='Num Complaints', ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92964ac2-32a8-4d29-92a9-7df90ba270ce",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],