diff --git a/data_cleaning.py b/data_cleaning.py
new file mode 100644
index 0000000..0f9a4e3
--- /dev/null
+++ b/data_cleaning.py
@@ -0,0 +1,127 @@
+# data_cleaning.py
+
+import pandas as pd
+import numpy as np
+
+def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Return a copy of ``df`` with standardized column names.
+
+    String labels are stripped of surrounding whitespace, lower-cased and
+    have spaces replaced by underscores. After that normalization ``st`` is
+    renamed to ``state`` and ``income`` to ``customer_income``. Labels that
+    are not strings are left untouched.
+
+    The caller's DataFrame is not modified: the new names are applied through
+    ``DataFrame.rename`` instead of assigning to ``df.columns``, which would
+    rename the caller's columns in place.
+    """
+    renames = {'st': 'state', 'income': 'customer_income'}
+
+    def _normalize(label):
+        # Only string labels can be normalized; keep e.g. integer labels as is.
+        if not isinstance(label, str):
+            return label
+        cleaned = label.strip().lower().replace(' ', '_')
+        return renames.get(cleaned, cleaned)
+
+    return df.rename(columns=_normalize)
+
+def standardize_gender(df):
+    """
+    Standardizes the 'gender' column to the codes 'F' and 'M'.
+
+    Any value whose text (ignoring surrounding whitespace and case) starts
+    with 'F' becomes 'F', and any value starting with 'M' becomes 'M'.
+    Other values are kept unchanged. Missing values stay missing rather than
+    being converted to the string 'nan', so they can still be detected and
+    filled later.
+
+    The column is replaced on the DataFrame that was passed in, and that same
+    DataFrame is returned. Frames without a 'gender' column are returned
+    untouched.
+    """
+    if 'gender' not in df.columns:
+        return df
+
+    def _normalize(value):
+        # Keep NaN/None as they are; str(NaN) would yield the text 'nan'.
+        if pd.isna(value):
+            return value
+        initial = str(value).strip()[:1].upper()
+        if initial in ('F', 'M'):
+            return initial
+        return value
+
+    df['gender'] = df['gender'].map(_normalize)
+    return df
+
+
+def standardize_state(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Standardizes the 'state' column to upper-case full state names.
+
+    Values are stripped of surrounding whitespace and upper-cased. Known
+    abbreviations (two-letter codes plus the informal 'CALI') are then
+    replaced by the full state name. Values that are not in the mapping,
+    such as names that are already spelled out, are kept in upper case.
+    Missing values stay missing instead of becoming the string 'NAN'.
+
+    The column is replaced on the DataFrame that was passed in, and that same
+    DataFrame is returned. Frames without a 'state' column are returned
+    untouched.
+    """
+    if 'state' in df.columns:
+        state_mapping = {
+            'AL': 'ALABAMA', 'AK': 'ALASKA', 'AZ': 'ARIZONA', 'AR': 'ARKANSAS',
+            'CA': 'CALIFORNIA', 'CALI': 'CALIFORNIA',
+            'CO': 'COLORADO', 'CT': 'CONNECTICUT', 'DE': 'DELAWARE', 'FL': 'FLORIDA', 'GA': 'GEORGIA',
+            'HI': 'HAWAII', 'ID': 'IDAHO', 'IL': 'ILLINOIS', 'IN': 'INDIANA', 'IA': 'IOWA',
+            'KS': 'KANSAS', 'KY': 'KENTUCKY', 'LA': 'LOUISIANA', 'ME': 'MAINE', 'MD': 'MARYLAND',
+            'MA': 'MASSACHUSETTS', 'MI': 'MICHIGAN', 'MN': 'MINNESOTA', 'MS': 'MISSISSIPPI', 'MO': 'MISSOURI',
+            'MT': 'MONTANA', 'NE': 'NEBRASKA', 'NV': 'NEVADA', 'NH': 'NEW HAMPSHIRE', 'NJ': 'NEW JERSEY',
+            'NM': 'NEW MEXICO', 'NY': 'NEW YORK', 'NC': 'NORTH CAROLINA', 'ND': 'NORTH DAKOTA', 'OH': 'OHIO',
+            'OK': 'OKLAHOMA', 'OR': 'OREGON', 'PA': 'PENNSYLVANIA', 'RI': 'RHODE ISLAND', 'SC': 'SOUTH CAROLINA',
+            'SD': 'SOUTH DAKOTA', 'TN': 'TENNESSEE', 'TX': 'TEXAS', 'UT': 'UTAH', 'VT': 'VERMONT',
+            'VA': 'VIRGINIA', 'WA': 'WASHINGTON', 'WV': 'WEST VIRGINIA', 'WI': 'WISCONSIN', 'WY': 'WYOMING',
+        }
+        states = df['state']
+        normalized = states.astype(str).str.strip().str.upper().replace(state_mapping)
+        # astype(str) turns NaN into text; blank those positions out again so
+        # missing states are still recognised as missing downstream.
+        df['state'] = normalized.where(states.notna())
+    return df
+
+def standardize_education(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Standardizes the 'education' column.
+
+    Non-missing values are converted to text, and every value that starts
+    with 'B' or 'b' (for example 'Bachelors') is replaced by 'Bachelor'.
+    Missing values stay missing instead of becoming the string 'nan'.
+
+    The column is replaced on the DataFrame that was passed in, and that same
+    DataFrame is returned. Frames without an 'education' column are returned
+    untouched.
+    """
+    if 'education' in df.columns:
+        education = df['education']
+        # astype(str) would turn NaN into 'nan'; restore the missing markers.
+        as_text = education.astype(str).where(education.notna())
+        starts_with_b = as_text.str.contains(r'^[Bb]', na=False)
+        df['education'] = as_text.mask(starts_with_b, 'Bachelor')
+    return df
+
+def standardize_vehicle_class(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Standardizes the 'vehicle_class' column by grouping similar values.
+
+    Values that start with 'Lu' (for example 'Luxury SUV' or 'Luxury Car')
+    and values containing the word 'Sports' are grouped under 'Luxury'.
+    All other non-missing values are kept as text. Missing values stay
+    missing instead of becoming the string 'nan'.
+
+    The column is replaced on the DataFrame that was passed in, and that same
+    DataFrame is returned. Frames without a 'vehicle_class' column are
+    returned untouched.
+    """
+    if 'vehicle_class' in df.columns:
+        vehicle = df['vehicle_class']
+        # astype(str) would turn NaN into 'nan'; restore the missing markers.
+        as_text = vehicle.astype(str).where(vehicle.notna())
+        # '^Lu' is a literal prefix. A character class such as '^[Lu]' would
+        # match any value that merely starts with 'L' or with 'u'.
+        is_luxury = as_text.str.contains(r'^Lu|\bSports\b', na=False)
+        df['vehicle_class'] = as_text.mask(is_luxury, 'Luxury')
+    return df
+
+def clean_and_convert_numerical(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Cleans and converts numerical columns.
+
+    * 'customer_lifetime_value': a trailing '%' is removed and the column is
+      converted to float64. Values that cannot be parsed become NaN.
+    * 'number_of_open_complaints': values written like '1/0/00' are reduced
+      to the number between the two slashes. Values that are already whole
+      numbers are kept. Everything else becomes missing. The result uses the
+      nullable 'Int64' dtype.
+
+    Columns are replaced on the DataFrame that was passed in, and that same
+    DataFrame is returned. Columns that are absent are skipped.
+    """
+    if 'customer_lifetime_value' in df.columns:
+        clv_text = df['customer_lifetime_value'].astype(str).str.rstrip('%')
+        df['customer_lifetime_value'] = pd.to_numeric(clv_text, errors='coerce').astype('float64')
+
+    if 'number_of_open_complaints' in df.columns:
+        raw = df['number_of_open_complaints']
+        # expand=False yields a Series, not a one-column DataFrame.
+        encoded = raw.astype(str).str.extract(r'/(\d+)/', expand=False)
+        from_encoded = pd.to_numeric(encoded, errors='coerce')
+        # Fallback for values that are already plain numbers and therefore do
+        # not match the slash pattern; without it they would all become NaN.
+        plain = pd.to_numeric(raw, errors='coerce').astype('float64')
+        # Only whole numbers can be stored as Int64; others stay missing.
+        plain = plain.where(plain % 1 == 0)
+        df['number_of_open_complaints'] = from_encoded.fillna(plain).astype('Int64')
+
+    return df
+
+def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Drops fully duplicated rows, keeping the first occurrence of each.
+
+    The result is a new DataFrame with a fresh 0..n-1 index. The number of
+    rows that were dropped is printed to standard output.
+    """
+    deduplicated = df.drop_duplicates(keep='first')
+    deduplicated = deduplicated.reset_index(drop=True)
+    print(f"Number of duplicate rows removed: {len(df) - len(deduplicated)}")
+    return deduplicated
+
+def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Imputes missing values in the known categorical and numerical columns.
+
+    Categorical columns receive the placeholder 'Unknown'. Numerical columns
+    receive their own median, but only when the column has a numeric dtype.
+    If 'number_of_open_complaints' is present, a boolean 'complaints_missing'
+    column flags the rows where it is missing. Columns that are absent are
+    skipped. The DataFrame that was passed in is modified and returned.
+    """
+    for name in ('customer', 'state', 'gender', 'education', 'policy_type', 'vehicle_class'):
+        if name not in df.columns:
+            continue
+        df[name] = df[name].fillna('Unknown')
+
+    for name in ('total_claim_amount', 'monthly_premium_auto', 'customer_income', 'customer_lifetime_value'):
+        if name not in df.columns:
+            continue
+        column = df[name]
+        if pd.api.types.is_numeric_dtype(column):
+            df[name] = column.fillna(column.median())
+
+    if 'number_of_open_complaints' in df.columns:
+        # The count itself is not imputed; only its absence is recorded.
+        df['complaints_missing'] = df['number_of_open_complaints'].isna()
+
+    return df
+
+def main_cleaning_pipeline(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Main function to execute the complete data cleaning and formatting pipeline.
+
+    Steps, in order: standardize column names, drop rows that contain no
+    data at all, standardize the gender / state / education / vehicle class
+    columns, convert the numerical columns, fill missing values, and remove
+    duplicate rows. Progress messages are printed to standard output.
+
+    Returns the cleaned DataFrame.
+    """
+    print("Starting the data cleaning and formatting pipeline...")
+
+    df = clean_column_names(df)
+    # Rows with no data at all would otherwise be imputed into a synthetic
+    # 'Unknown'/median record, one copy of which survives deduplication.
+    # reset_index gives later steps a fresh frame to assign columns on.
+    df = df.dropna(how='all').reset_index(drop=True)
+    df = standardize_gender(df)
+    df = standardize_state(df)
+    df = standardize_education(df)
+    df = standardize_vehicle_class(df)
+    df = clean_and_convert_numerical(df)
+    df = handle_missing_values(df)
+    df = remove_duplicates(df)
+
+    print("Data cleaning pipeline completed successfully.")
+    return df
\ No newline at end of file
diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
index ec4e3f9..5dd7049 100644
--- a/lab-dw-data-structuring-and-combining.ipynb
+++ b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,264 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "492d06e3-92c7-4105-ac72-536db98d3244",
"metadata": {
"id": "492d06e3-92c7-4105-ac72-536db98d3244"
},
"outputs": [],
"source": [
- "# Your code goes here"
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "88ca076a-1c83-4566-bd61-e9dadfa09d6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n",
+ "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\")\n",
+ "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "60bfb74e-d199-4f19-b554-9dd27f852d83",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Enable autoreload\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "c3ab77d2-1e35-47f8-9274-b789cc92968c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import cleaning module\n",
+ "import data_cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "29099c40-9ca7-4c8b-b3b1-eb9558882156",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting the data cleaning and formatting pipeline...\n",
+ "Number of duplicate rows removed: 2936\n",
+ "Data cleaning pipeline completed successfully.\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_cleaned = data_cleaning.main_cleaning_pipeline(df1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "1d4f3233-e292-45d0-b320-103611ba632a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting the data cleaning and formatting pipeline...\n",
+ "Number of duplicate rows removed: 0\n",
+ "Data cleaning pipeline completed successfully.\n"
+ ]
+ }
+ ],
+ "source": [
+ "df2_cleaned = data_cleaning.main_cleaning_pipeline(df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "92363654-4a71-4f39-ac55-22f4ec286278",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting the data cleaning and formatting pipeline...\n",
+ "Number of duplicate rows removed: 0\n",
+ "Data cleaning pipeline completed successfully.\n"
+ ]
+ }
+ ],
+ "source": [
+ "df3_cleaned = data_cleaning.main_cleaning_pipeline(df3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "40584279-b21c-4974-ae7d-3e89e738843b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer | \n",
+ " state | \n",
+ " gender | \n",
+ " education | \n",
+ " customer_lifetime_value | \n",
+ " customer_income | \n",
+ " monthly_premium_auto | \n",
+ " number_of_open_complaints | \n",
+ " policy_type | \n",
+ " vehicle_class | \n",
+ " total_claim_amount | \n",
+ " complaints_missing | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " RB50392 | \n",
+ " WASHINGTON | \n",
+ " nan | \n",
+ " Master | \n",
+ " 588174.235 | \n",
+ " 0.0 | \n",
+ " 1000.0 | \n",
+ " 0 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 2.704934 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " QZ44356 | \n",
+ " ARIZONA | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 697953.590 | \n",
+ " 0.0 | \n",
+ " 94.0 | \n",
+ " 0 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 1131.464935 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " AI49188 | \n",
+ " NEVADA | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 1288743.170 | \n",
+ " 48767.0 | \n",
+ " 108.0 | \n",
+ " 0 | \n",
+ " Personal Auto | \n",
+ " Two-Door Car | \n",
+ " 566.472247 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " WW63253 | \n",
+ " CALIFORNIA | \n",
+ " M | \n",
+ " Bachelor | \n",
+ " 764586.180 | \n",
+ " 0.0 | \n",
+ " 106.0 | \n",
+ " 0 | \n",
+ " Corporate Auto | \n",
+ " SUV | \n",
+ " 529.881344 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " GA49547 | \n",
+ " WASHINGTON | \n",
+ " M | \n",
+ " High School or Below | \n",
+ " 536307.650 | \n",
+ " 36357.0 | \n",
+ " 68.0 | \n",
+ " 0 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 17.269323 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer state gender education customer_lifetime_value \\\n",
+ "0 RB50392 WASHINGTON nan Master 588174.235 \n",
+ "1 QZ44356 ARIZONA F Bachelor 697953.590 \n",
+ "2 AI49188 NEVADA F Bachelor 1288743.170 \n",
+ "3 WW63253 CALIFORNIA M Bachelor 764586.180 \n",
+ "4 GA49547 WASHINGTON M High School or Below 536307.650 \n",
+ "\n",
+ " customer_income monthly_premium_auto number_of_open_complaints \\\n",
+ "0 0.0 1000.0 0 \n",
+ "1 0.0 94.0 0 \n",
+ "2 48767.0 108.0 0 \n",
+ "3 0.0 106.0 0 \n",
+ "4 36357.0 68.0 0 \n",
+ "\n",
+ " policy_type vehicle_class total_claim_amount complaints_missing \n",
+ "0 Personal Auto Four-Door Car 2.704934 False \n",
+ "1 Personal Auto Four-Door Car 1131.464935 False \n",
+ "2 Personal Auto Two-Door Car 566.472247 False \n",
+ "3 Corporate Auto SUV 529.881344 False \n",
+ "4 Personal Auto Four-Door Car 17.269323 False "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_cleaned = pd.concat([df1_cleaned, df2_cleaned, df3_cleaned], ignore_index=True)\n",
+ "df_cleaned.head()"
]
},
{
@@ -72,14 +322,229 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
"metadata": {
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unnamed:_0 | \n",
+ " customer | \n",
+ " state | \n",
+ " customer_lifetime_value | \n",
+ " response | \n",
+ " coverage | \n",
+ " education | \n",
+ " effective_to_date | \n",
+ " employmentstatus | \n",
+ " gender | \n",
+ " ... | \n",
+ " number_of_policies | \n",
+ " policy_type | \n",
+ " policy | \n",
+ " renew_offer_type | \n",
+ " sales_channel | \n",
+ " total_claim_amount | \n",
+ " vehicle_class | \n",
+ " vehicle_size | \n",
+ " vehicle_type | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " DK49336 | \n",
+ " Arizona | \n",
+ " 4809.216960 | \n",
+ " No | \n",
+ " Basic | \n",
+ " College | \n",
+ " 2011-02-18 | \n",
+ " Employed | \n",
+ " M | \n",
+ " ... | \n",
+ " 9 | \n",
+ " Corporate Auto | \n",
+ " Corporate L3 | \n",
+ " Offer3 | \n",
+ " Agent | \n",
+ " 292.800000 | \n",
+ " Four-Door Car | \n",
+ " Medsize | \n",
+ " A | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " KX64629 | \n",
+ " California | \n",
+ " 2228.525238 | \n",
+ " No | \n",
+ " Basic | \n",
+ " College | \n",
+ " 2011-01-18 | \n",
+ " Unemployed | \n",
+ " F | \n",
+ " ... | \n",
+ " 1 | \n",
+ " Personal Auto | \n",
+ " Personal L3 | \n",
+ " Offer4 | \n",
+ " Call Center | \n",
+ " 744.924331 | \n",
+ " Four-Door Car | \n",
+ " Medsize | \n",
+ " A | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " LZ68649 | \n",
+ " Washington | \n",
+ " 14947.917300 | \n",
+ " No | \n",
+ " Basic | \n",
+ " Bachelor | \n",
+ " 2011-02-10 | \n",
+ " Employed | \n",
+ " M | \n",
+ " ... | \n",
+ " 2 | \n",
+ " Personal Auto | \n",
+ " Personal L3 | \n",
+ " Offer3 | \n",
+ " Call Center | \n",
+ " 480.000000 | \n",
+ " SUV | \n",
+ " Medsize | \n",
+ " A | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " XL78013 | \n",
+ " Oregon | \n",
+ " 22332.439460 | \n",
+ " Yes | \n",
+ " Extended | \n",
+ " College | \n",
+ " 2011-01-11 | \n",
+ " Employed | \n",
+ " M | \n",
+ " ... | \n",
+ " 2 | \n",
+ " Corporate Auto | \n",
+ " Corporate L3 | \n",
+ " Offer2 | \n",
+ " Branch | \n",
+ " 484.013411 | \n",
+ " Four-Door Car | \n",
+ " Medsize | \n",
+ " A | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " QA50777 | \n",
+ " Oregon | \n",
+ " 9025.067525 | \n",
+ " No | \n",
+ " Premium | \n",
+ " Bachelor | \n",
+ " 2011-01-17 | \n",
+ " Medical Leave | \n",
+ " F | \n",
+ " ... | \n",
+ " 7 | \n",
+ " Personal Auto | \n",
+ " Personal L2 | \n",
+ " Offer1 | \n",
+ " Branch | \n",
+ " 707.925645 | \n",
+ " Four-Door Car | \n",
+ " Medsize | \n",
+ " A | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 27 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " unnamed:_0 customer state customer_lifetime_value response \\\n",
+ "0 0 DK49336 Arizona 4809.216960 No \n",
+ "1 1 KX64629 California 2228.525238 No \n",
+ "2 2 LZ68649 Washington 14947.917300 No \n",
+ "3 3 XL78013 Oregon 22332.439460 Yes \n",
+ "4 4 QA50777 Oregon 9025.067525 No \n",
+ "\n",
+ " coverage education effective_to_date employmentstatus gender ... \\\n",
+ "0 Basic College 2011-02-18 Employed M ... \n",
+ "1 Basic College 2011-01-18 Unemployed F ... \n",
+ "2 Basic Bachelor 2011-02-10 Employed M ... \n",
+ "3 Extended College 2011-01-11 Employed M ... \n",
+ "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n",
+ "\n",
+ " number_of_policies policy_type policy renew_offer_type \\\n",
+ "0 9 Corporate Auto Corporate L3 Offer3 \n",
+ "1 1 Personal Auto Personal L3 Offer4 \n",
+ "2 2 Personal Auto Personal L3 Offer3 \n",
+ "3 2 Corporate Auto Corporate L3 Offer2 \n",
+ "4 7 Personal Auto Personal L2 Offer1 \n",
+ "\n",
+ " sales_channel total_claim_amount vehicle_class vehicle_size \\\n",
+ "0 Agent 292.800000 Four-Door Car Medsize \n",
+ "1 Call Center 744.924331 Four-Door Car Medsize \n",
+ "2 Call Center 480.000000 SUV Medsize \n",
+ "3 Branch 484.013411 Four-Door Car Medsize \n",
+ "4 Branch 707.925645 Four-Door Car Medsize \n",
+ "\n",
+ " vehicle_type month \n",
+ "0 A 2 \n",
+ "1 A 1 \n",
+ "2 A 2 \n",
+ "3 A 1 \n",
+ "4 A 1 \n",
+ "\n",
+ "[5 rows x 27 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code goes here"
+ "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n",
+ "df.head()"
]
},
{
@@ -103,6 +568,173 @@
"2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "6f3b98b9-4284-45bb-9838-6c95b98b6c46",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 10910 entries, 0 to 10909\n",
+ "Data columns (total 27 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 unnamed:_0 10910 non-null int64 \n",
+ " 1 customer 10910 non-null object \n",
+ " 2 state 10910 non-null object \n",
+ " 3 customer_lifetime_value 10910 non-null float64\n",
+ " 4 response 10910 non-null object \n",
+ " 5 coverage 10910 non-null object \n",
+ " 6 education 10910 non-null object \n",
+ " 7 effective_to_date 10910 non-null object \n",
+ " 8 employmentstatus 10910 non-null object \n",
+ " 9 gender 10910 non-null object \n",
+ " 10 income 10910 non-null int64 \n",
+ " 11 location_code 10910 non-null object \n",
+ " 12 marital_status 10910 non-null object \n",
+ " 13 monthly_premium_auto 10910 non-null int64 \n",
+ " 14 months_since_last_claim 10910 non-null float64\n",
+ " 15 months_since_policy_inception 10910 non-null int64 \n",
+ " 16 number_of_open_complaints 10910 non-null float64\n",
+ " 17 number_of_policies 10910 non-null int64 \n",
+ " 18 policy_type 10910 non-null object \n",
+ " 19 policy 10910 non-null object \n",
+ " 20 renew_offer_type 10910 non-null object \n",
+ " 21 sales_channel 10910 non-null object \n",
+ " 22 total_claim_amount 10910 non-null float64\n",
+ " 23 vehicle_class 10910 non-null object \n",
+ " 24 vehicle_size 10910 non-null object \n",
+ " 25 vehicle_type 10910 non-null object \n",
+ " 26 month 10910 non-null int64 \n",
+ "dtypes: float64(4), int64(6), object(17)\n",
+ "memory usage: 2.2+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "f8beddc0-36fc-47ad-ab92-dacc6e41f695",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " total_claim_amount\n",
+ "sales_channel \n",
+ "Agent 1810226.82\n",
+ "Branch 1301204.00\n",
+ "Call Center 926600.82\n",
+ "Web 706600.04\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Summary table showing the total revenue for each sales channel\n",
+ "pivot_revenue = df.pivot_table(\n",
+ " values='total_claim_amount',\n",
+ " index='sales_channel',\n",
+ " aggfunc='sum'\n",
+ ").round(2)\n",
+ "\n",
+ "print(pivot_revenue)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "9006dfc7-e010-43ef-8bf6-9c219e12ddb7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " education | \n",
+ " Bachelor | \n",
+ " College | \n",
+ " Doctor | \n",
+ " High School or Below | \n",
+ " Master | \n",
+ "
\n",
+ " \n",
+ " gender | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " F | \n",
+ " 7874.27 | \n",
+ " 7748.82 | \n",
+ " 7328.51 | \n",
+ " 8675.22 | \n",
+ " 8157.05 | \n",
+ "
\n",
+ " \n",
+ " M | \n",
+ " 7703.60 | \n",
+ " 8052.46 | \n",
+ " 7415.33 | \n",
+ " 8149.69 | \n",
+ " 8168.83 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "education Bachelor College Doctor High School or Below Master\n",
+ "gender \n",
+ "F 7874.27 7748.82 7328.51 8675.22 8157.05\n",
+ "M 7703.60 8052.46 7415.33 8149.69 8168.83"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pivot_clv = df.pivot_table(\n",
+ " values='customer_lifetime_value',\n",
+ " index='gender',\n",
+ " columns='education',\n",
+ " aggfunc='mean'\n",
+ ").round(2)\n",
+ "\n",
+ "pivot_clv"
+ ]
+ },
{
"cell_type": "markdown",
"id": "32c7f2e5-3d90-43e5-be33-9781b6069198",
@@ -130,15 +762,409 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"id": "3a069e0b-b400-470e-904d-d17582191be4",
"metadata": {
"id": "3a069e0b-b400-470e-904d-d17582191be4"
},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " policy_type | \n",
+ " number_of_open_complaints | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Corporate Auto | \n",
+ " 0.000000 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Personal Auto | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Personal Auto | \n",
+ " 0.000000 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Corporate Auto | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Personal Auto | \n",
+ " 0.384256 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " policy_type number_of_open_complaints month\n",
+ "0 Corporate Auto 0.000000 2\n",
+ "1 Personal Auto 0.000000 1\n",
+ "2 Personal Auto 0.000000 2\n",
+ "3 Corporate Auto 0.000000 1\n",
+ "4 Personal Auto 0.384256 1"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[['policy_type', 'number_of_open_complaints', 'month']].head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "2a125542-e6f6-4be5-bae4-941ec3f9af7e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "month\n",
+ "1 5818\n",
+ "2 5092\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.month.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "7558edbf-7d65-47d3-a17a-43528257af6b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "number_of_open_complaints\n",
+ "0.000000 8160\n",
+ "1.000000 1145\n",
+ "0.384256 633\n",
+ "2.000000 414\n",
+ "3.000000 324\n",
+ "4.000000 166\n",
+ "5.000000 68\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.number_of_open_complaints.value_counts().head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "b768d078-f660-4e68-8f25-dfe69990eb1a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0. , 0.38425611, 3. , 1. , 2. ,\n",
+ " 4. , 5. ])"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['number_of_open_complaints'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "b4722727-e0bd-4f45-a816-def7f4898a3d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "df['number_of_open_complaints'] = df['number_of_open_complaints'].astype(str).str.extract(r'^(\\d)').astype(float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "fef4dd7a-2df7-4294-ac98-4def3efb9775",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "number_of_open_complaints\n",
+ "0.0 8793\n",
+ "1.0 1145\n",
+ "2.0 414\n",
+ "3.0 324\n",
+ "4.0 166\n",
+ "5.0 68\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['number_of_open_complaints'].value_counts().sort_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "0df38b71-2ac3-48c9-8cc9-60107b2e7cbf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " policy_type | \n",
+ " number_of_open_complaints | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Corporate Auto | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Personal Auto | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Personal Auto | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Corporate Auto | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Personal Auto | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " policy_type number_of_open_complaints month\n",
+ "0 Corporate Auto 0.0 2\n",
+ "1 Personal Auto 0.0 1\n",
+ "2 Personal Auto 0.0 2\n",
+ "3 Corporate Auto 0.0 1\n",
+ "4 Personal Auto 0.0 1"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[['policy_type', 'number_of_open_complaints', 'month']].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "77303f92-45bc-4f46-9e0f-6e9eb5c0372b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_complaints = df[df['number_of_open_complaints'].notna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "f3617030-52d8-43f8-807d-7a8bce40c8c9",
+ "metadata": {},
"outputs": [],
"source": [
- "# Your code goes here"
+ "complaints_summary = df_complaints.groupby(['policy_type', 'month']).size().reset_index(name='Num Complaints')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "a2bc7621-a1b1-4c86-ba78-a552073de300",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " policy_type | \n",
+ " month | \n",
+ " Num Complaints | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " Personal Auto | \n",
+ " 1 | \n",
+ " 4329 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Personal Auto | \n",
+ " 2 | \n",
+ " 3799 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Corporate Auto | \n",
+ " 1 | \n",
+ " 1252 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Corporate Auto | \n",
+ " 2 | \n",
+ " 1089 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Special Auto | \n",
+ " 1 | \n",
+ " 237 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Special Auto | \n",
+ " 2 | \n",
+ " 204 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " policy_type month Num Complaints\n",
+ "2 Personal Auto 1 4329\n",
+ "3 Personal Auto 2 3799\n",
+ "0 Corporate Auto 1 1252\n",
+ "1 Corporate Auto 2 1089\n",
+ "4 Special Auto 1 237\n",
+ "5 Special Auto 2 204"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "complaints_summary.sort_values(by='Num Complaints', ascending=False)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "92964ac2-32a8-4d29-92a9-7df90ba270ce",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -146,9 +1172,9 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python [conda env:base] *",
"language": "python",
- "name": "python3"
+ "name": "conda-base-py"
},
"language_info": {
"codemirror_mode": {
@@ -160,7 +1186,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.13.5"
}
},
"nbformat": 4,