From 045d98c4cf8a5bd9c3227785ab4c92fab8c2dd82 Mon Sep 17 00:00:00 2001
From: Miguel Florindo <miguelflorindo0gmail.com>
Date: Sat, 27 Sep 2025 15:12:56 +0100
Subject: [PATCH] finished Lab

---
 lab-dw-data-structuring-and-combining.ipynb | 613 +++++++++++++++++++-
 1 file changed, 606 insertions(+), 7 deletions(-)

diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
index ec4e3f9..b75b673 100644
--- a/lab-dw-data-structuring-and-combining.ipynb
+++ b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,64 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "492d06e3-92c7-4105-ac72-536db98d3244",
    "metadata": {
     "id": "492d06e3-92c7-4105-ac72-536db98d3244"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Customer          ST GENDER             Education Customer Lifetime Value  \\\n",
+      "0  RB50392  Washington    NaN                Master                     NaN   \n",
+      "1  QZ44356     Arizona      F              Bachelor              697953.59%   \n",
+      "2  AI49188      Nevada      F              Bachelor             1288743.17%   \n",
+      "3  WW63253  California      M              Bachelor              764586.18%   \n",
+      "4  GA49547  Washington      M  High School or Below              536307.65%   \n",
+      "\n",
+      "    Income  Monthly Premium Auto Number of Open Complaints     Policy Type  \\\n",
+      "0      0.0                1000.0                    1/0/00   Personal Auto   \n",
+      "1      0.0                  94.0                    1/0/00   Personal Auto   \n",
+      "2  48767.0                 108.0                    1/0/00   Personal Auto   \n",
+      "3      0.0                 106.0                    1/0/00  Corporate Auto   \n",
+      "4  36357.0                  68.0                    1/0/00   Personal Auto   \n",
+      "\n",
+      "   Vehicle Class  Total Claim Amount State Gender  \n",
+      "0  Four-Door Car            2.704934   NaN    NaN  \n",
+      "1  Four-Door Car         1131.464935   NaN    NaN  \n",
+      "2   Two-Door Car          566.472247   NaN    NaN  \n",
+      "3            SUV          529.881344   NaN    NaN  \n",
+      "4  Four-Door Car           17.269323   NaN    NaN  \n",
+      "Total cleaned rows: 9135\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "urls = [\n",
+    "    \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\",\n",
+    "    \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\",\n",
+    "    \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n",
+    "]\n",
+    "# The files disagree on two headers ('ST'/'GENDER' vs 'State'/'Gender'); map them to\n",
+    "# one spelling so concat does not produce duplicate, half-empty columns.\n",
+    "header_aliases = {'ST': 'State', 'GENDER': 'Gender'}\n",
+    "dataframes = []\n",
+    "for url in urls:\n",
+    "    df = pd.read_csv(url).rename(columns=header_aliases)\n",
+    "    dataframes.append(df)\n",
+    "\n",
+    "combined_df = pd.concat(dataframes, ignore_index=True)\n",
+    "# Remove exact duplicate rows (reassigned instead of mutated in place).\n",
+    "combined_df = combined_df.drop_duplicates()\n",
+    "# NOTE: the former fillna on 'column_a'/'column_b' was dropped; those columns do not\n",
+    "# exist in the combined frame, so the call never filled anything.\n",
+    "\n",
+    "print(combined_df.head())\n",
+    "print(f\"Total cleaned rows: {len(combined_df)}\")"
    ]
   },
   {
@@ -72,14 +122,505 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
    "metadata": {
     "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
    },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First few rows:\n",
+      "   unnamed:_0 customer       state  customer_lifetime_value response  \\\n",
+      "0           0  DK49336     Arizona              4809.216960       No   \n",
+      "1           1  KX64629  California              2228.525238       No   \n",
+      "2           2  LZ68649  Washington             14947.917300       No   \n",
+      "3           3  XL78013      Oregon             22332.439460      Yes   \n",
+      "4           4  QA50777      Oregon              9025.067525       No   \n",
+      "\n",
+      "   coverage education effective_to_date employmentstatus gender  ...  \\\n",
+      "0     Basic   College        2011-02-18         Employed      M  ...   \n",
+      "1     Basic   College        2011-01-18       Unemployed      F  ...   \n",
+      "2     Basic  Bachelor        2011-02-10         Employed      M  ...   \n",
+      "3  Extended   College        2011-01-11         Employed      M  ...   \n",
+      "4   Premium  Bachelor        2011-01-17    Medical Leave      F  ...   \n",
+      "\n",
+      "   number_of_policies     policy_type        policy  renew_offer_type  \\\n",
+      "0                   9  Corporate Auto  Corporate L3            Offer3   \n",
+      "1                   1   Personal Auto   Personal L3            Offer4   \n",
+      "2                   2   Personal Auto   Personal L3            Offer3   \n",
+      "3                   2  Corporate Auto  Corporate L3            Offer2   \n",
+      "4                   7   Personal Auto   Personal L2            Offer1   \n",
+      "\n",
+      "   sales_channel  total_claim_amount  vehicle_class  vehicle_size  \\\n",
+      "0          Agent          292.800000  Four-Door Car       Medsize   \n",
+      "1    Call Center          744.924331  Four-Door Car       Medsize   \n",
+      "2    Call Center          480.000000            SUV       Medsize   \n",
+      "3         Branch          484.013411  Four-Door Car       Medsize   \n",
+      "4         Branch          707.925645  Four-Door Car       Medsize   \n",
+      "\n",
+      "  vehicle_type month  \n",
+      "0            A     2  \n",
+      "1            A     1  \n",
+      "2            A     2  \n",
+      "3            A     1  \n",
+      "4            A     1  \n",
+      "\n",
+      "[5 rows x 27 columns]\n",
+      "\n",
+      "Dataset info:\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10910 entries, 0 to 10909\n",
+      "Data columns (total 27 columns):\n",
+      " #   Column                         Non-Null Count  Dtype  \n",
+      "---  ------                         --------------  -----  \n",
+      " 0   unnamed:_0                     10910 non-null  int64  \n",
+      " 1   customer                       10910 non-null  object \n",
+      " 2   state                          10910 non-null  object \n",
+      " 3   customer_lifetime_value        10910 non-null  float64\n",
+      " 4   response                       10910 non-null  object \n",
+      " 5   coverage                       10910 non-null  object \n",
+      " 6   education                      10910 non-null  object \n",
+      " 7   effective_to_date              10910 non-null  object \n",
+      " 8   employmentstatus               10910 non-null  object \n",
+      " 9   gender                         10910 non-null  object \n",
+      " 10  income                         10910 non-null  int64  \n",
+      " 11  location_code                  10910 non-null  object \n",
+      " 12  marital_status                 10910 non-null  object \n",
+      " 13  monthly_premium_auto           10910 non-null  int64  \n",
+      " 14  months_since_last_claim        10910 non-null  float64\n",
+      " 15  months_since_policy_inception  10910 non-null  int64  \n",
+      " 16  number_of_open_complaints      10910 non-null  float64\n",
+      " 17  number_of_policies             10910 non-null  int64  \n",
+      " 18  policy_type                    10910 non-null  object \n",
+      " 19  policy                         10910 non-null  object \n",
+      " 20  renew_offer_type               10910 non-null  object \n",
+      " 21  sales_channel                  10910 non-null  object \n",
+      " 22  total_claim_amount             10910 non-null  float64\n",
+      " 23  vehicle_class                  10910 non-null  object \n",
+      " 24  vehicle_size                   10910 non-null  object \n",
+      " 25  vehicle_type                   10910 non-null  object \n",
+      " 26  month                          10910 non-null  int64  \n",
+      "dtypes: float64(4), int64(6), object(17)\n",
+      "memory usage: 2.2+ MB\n",
+      "None\n",
+      "\n",
+      "Summary statistics:\n",
+      "         unnamed:_0  customer_lifetime_value        income  \\\n",
+      "count  10910.000000             10910.000000  10910.000000   \n",
+      "mean    5454.500000              8018.241094  37536.284785   \n",
+      "std     3149.590053              6885.081434  30359.195670   \n",
+      "min        0.000000              1898.007675      0.000000   \n",
+      "25%     2727.250000              4014.453113      0.000000   \n",
+      "50%     5454.500000              5771.147235  33813.500000   \n",
+      "75%     8181.750000              8992.779137  62250.750000   \n",
+      "max    10909.000000             83325.381190  99981.000000   \n",
+      "\n",
+      "       monthly_premium_auto  months_since_last_claim  \\\n",
+      "count          10910.000000             10910.000000   \n",
+      "mean              93.196059                15.149071   \n",
+      "std               34.442532                 9.783520   \n",
+      "min               61.000000                 0.000000   \n",
+      "25%               68.000000                 7.000000   \n",
+      "50%               83.000000                15.000000   \n",
+      "75%              109.000000                23.000000   \n",
+      "max              298.000000                35.000000   \n",
+      "\n",
+      "       months_since_policy_inception  number_of_open_complaints  \\\n",
+      "count                   10910.000000               10910.000000   \n",
+      "mean                       48.091934                   0.384256   \n",
+      "std                        27.940675                   0.885589   \n",
+      "min                         0.000000                   0.000000   \n",
+      "25%                        24.000000                   0.000000   \n",
+      "50%                        48.000000                   0.000000   \n",
+      "75%                        71.000000                   0.384256   \n",
+      "max                        99.000000                   5.000000   \n",
+      "\n",
+      "       number_of_policies  total_claim_amount         month  \n",
+      "count        10910.000000        10910.000000  10910.000000  \n",
+      "mean             2.979193          434.888330      1.466728  \n",
+      "std              2.399359          292.180556      0.498915  \n",
+      "min              1.000000            0.099007      1.000000  \n",
+      "25%              1.000000          271.082527      1.000000  \n",
+      "50%              2.000000          382.564630      1.000000  \n",
+      "75%              4.000000          547.200000      2.000000  \n",
+      "max              9.000000         2893.239678      2.000000  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Load the dataset directly from the URL\n",
+    "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
+    "df = pd.read_csv(url)\n",
+    "\n",
+    "# Preview the first rows of the freshly loaded frame\n",
+    "print(\"First few rows:\")\n",
+    "print(df.head())\n",
+    "\n",
+    "print(\"\\nDataset info:\")\n",
+    "df.info()  # info() writes its own report and returns None; wrapping it in print() added a stray 'None'\n",
+    "\n",
+    "print(\"\\nSummary statistics:\")\n",
+    "print(df.describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5af0f538",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Shape: (10910, 27)\n",
+      "Columns: ['unnamed:_0', 'customer', 'state', 'customer_lifetime_value', 'response', 'coverage', 'education', 'effective_to_date', 'employmentstatus', 'gender', 'income', 'location_code', 'marital_status', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception', 'number_of_open_complaints', 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size', 'vehicle_type', 'month']\n",
+      "\n",
+      "Missing values per column:\n",
+      "unnamed:_0                       0\n",
+      "customer                         0\n",
+      "state                            0\n",
+      "customer_lifetime_value          0\n",
+      "response                         0\n",
+      "coverage                         0\n",
+      "education                        0\n",
+      "effective_to_date                0\n",
+      "employmentstatus                 0\n",
+      "gender                           0\n",
+      "income                           0\n",
+      "location_code                    0\n",
+      "marital_status                   0\n",
+      "monthly_premium_auto             0\n",
+      "months_since_last_claim          0\n",
+      "months_since_policy_inception    0\n",
+      "number_of_open_complaints        0\n",
+      "number_of_policies               0\n",
+      "policy_type                      0\n",
+      "policy                           0\n",
+      "renew_offer_type                 0\n",
+      "sales_channel                    0\n",
+      "total_claim_amount               0\n",
+      "vehicle_class                    0\n",
+      "vehicle_size                     0\n",
+      "vehicle_type                     0\n",
+      "month                            0\n",
+      "dtype: int64\n",
+      "\n",
+      "Duplicate rows: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Dimensions and column names of the loaded frame\n",
+    "print(\"Shape:\", df.shape)\n",
+    "print(\"Columns:\", df.columns.tolist())\n",
+    "\n",
+    "# Null count for every column\n",
+    "print(\"\\nMissing values per column:\", df.isna().sum(), sep=\"\\n\")\n",
+    "\n",
+    "# Number of rows that are exact copies of an earlier row\n",
+    "print(\"\\nDuplicate rows:\", df.duplicated().sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "dc41ba64",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code goes here"
+    "df.columns = [name.lower().replace(' ', '_') for name in df.columns]  # lower-case headers, spaces -> underscores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ede5ccd2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "unnamed:_0                       0\n",
+      "customer                         0\n",
+      "state                            0\n",
+      "customer_lifetime_value          0\n",
+      "response                         0\n",
+      "coverage                         0\n",
+      "education                        0\n",
+      "effective_to_date                0\n",
+      "employmentstatus                 0\n",
+      "gender                           0\n",
+      "income                           0\n",
+      "location_code                    0\n",
+      "marital_status                   0\n",
+      "monthly_premium_auto             0\n",
+      "months_since_last_claim          0\n",
+      "months_since_policy_inception    0\n",
+      "number_of_open_complaints        0\n",
+      "number_of_policies               0\n",
+      "policy_type                      0\n",
+      "policy                           0\n",
+      "renew_offer_type                 0\n",
+      "sales_channel                    0\n",
+      "total_claim_amount               0\n",
+      "vehicle_class                    0\n",
+      "vehicle_size                     0\n",
+      "vehicle_type                     0\n",
+      "month                            0\n",
+      "dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.isna().sum())  # null count per column; isna() is the same check as isnull()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "99fe37c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop 'vehicle_type' only when the column exists and more than 80% of its\n",
+    "# values are missing; otherwise the frame is left untouched.\n",
+    "if 'vehicle_type' in df.columns and df['vehicle_type'].isna().sum() / len(df) > 0.8:\n",
+    "    df.drop(columns='vehicle_type', inplace=True)\n",
+    "    print(\"Dropped 'vehicle_type' due to high missingness.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e484ac45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\migue\\AppData\\Local\\Temp\\ipykernel_26136\\1209922921.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[col].fillna(df[col].median(), inplace=True)\n",
+      "C:\\Users\\migue\\AppData\\Local\\Temp\\ipykernel_26136\\1209922921.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  df[col].fillna(df[col].mode()[0], inplace=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fill missing numeric values with the column median (assigned back: chained inplace fillna is deprecated)\n",
+    "numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
+    "for col in numeric_cols:\n",
+    "    df[col] = df[col].fillna(df[col].median())\n",
+    "\n",
+    "# Fill missing text values with the column mode\n",
+    "for col in df.select_dtypes(include=['object']).columns:\n",
+    "    if df[col].notnull().any():  # avoid all-NaN case, where mode() is empty\n",
+    "        df[col] = df[col].fillna(df[col].mode()[0])\n",
+    "\n",
+    "# Parse the date column (unparseable values become NaT), then derive the month for later analysis\n",
+    "df['effective_to_date'] = pd.to_datetime(df['effective_to_date'], errors='coerce')\n",
+    "df['effective_month'] = df['effective_to_date'].dt.month\n",
+    "\n",
+    "# Collect the text columns only now, so the parsed date column is not among them\n",
+    "categorical_cols = df.select_dtypes(include=['object']).columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "130e0a58",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for col in df.select_dtypes(include=['object']).columns:  # text columns as of now, so already-parsed dates stay datetime\n",
+    "    df[col] = df[col].astype('category')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "c018edbc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE(review): no object-dtype columns remain once the category cast above has run, so this loop is currently a no-op - confirm the intended cell order.\n",
+    "for col in df.select_dtypes(include='object').columns:\n",
+    "    df[col] = df[col].str.strip().str.title()  # trim surrounding whitespace, then Title Case"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "c12c4b4b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cleaned Data Info:\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10910 entries, 0 to 10909\n",
+      "Data columns (total 28 columns):\n",
+      " #   Column                         Non-Null Count  Dtype   \n",
+      "---  ------                         --------------  -----   \n",
+      " 0   unnamed:_0                     10910 non-null  int64   \n",
+      " 1   customer                       10910 non-null  category\n",
+      " 2   state                          10910 non-null  category\n",
+      " 3   customer_lifetime_value        10910 non-null  float64 \n",
+      " 4   response                       10910 non-null  category\n",
+      " 5   coverage                       10910 non-null  category\n",
+      " 6   education                      10910 non-null  category\n",
+      " 7   effective_to_date              10910 non-null  category\n",
+      " 8   employmentstatus               10910 non-null  category\n",
+      " 9   gender                         10910 non-null  category\n",
+      " 10  income                         10910 non-null  int64   \n",
+      " 11  location_code                  10910 non-null  category\n",
+      " 12  marital_status                 10910 non-null  category\n",
+      " 13  monthly_premium_auto           10910 non-null  int64   \n",
+      " 14  months_since_last_claim        10910 non-null  float64 \n",
+      " 15  months_since_policy_inception  10910 non-null  int64   \n",
+      " 16  number_of_open_complaints      10910 non-null  float64 \n",
+      " 17  number_of_policies             10910 non-null  int64   \n",
+      " 18  policy_type                    10910 non-null  category\n",
+      " 19  policy                         10910 non-null  category\n",
+      " 20  renew_offer_type               10910 non-null  category\n",
+      " 21  sales_channel                  10910 non-null  category\n",
+      " 22  total_claim_amount             10910 non-null  float64 \n",
+      " 23  vehicle_class                  10910 non-null  category\n",
+      " 24  vehicle_size                   10910 non-null  category\n",
+      " 25  vehicle_type                   10910 non-null  category\n",
+      " 26  month                          10910 non-null  int64   \n",
+      " 27  effective_month                10910 non-null  int32   \n",
+      "dtypes: category(17), float64(4), int32(1), int64(6)\n",
+      "memory usage: 1.4 MB\n",
+      "None\n",
+      "\n",
+      "Cleaned Data Sample:\n",
+      "   unnamed:_0 customer       state  customer_lifetime_value response  \\\n",
+      "0           0  DK49336     Arizona              4809.216960       No   \n",
+      "1           1  KX64629  California              2228.525238       No   \n",
+      "2           2  LZ68649  Washington             14947.917300       No   \n",
+      "3           3  XL78013      Oregon             22332.439460      Yes   \n",
+      "4           4  QA50777      Oregon              9025.067525       No   \n",
+      "\n",
+      "   coverage education effective_to_date employmentstatus gender  ...  \\\n",
+      "0     Basic   College        2011-02-18         Employed      M  ...   \n",
+      "1     Basic   College        2011-01-18       Unemployed      F  ...   \n",
+      "2     Basic  Bachelor        2011-02-10         Employed      M  ...   \n",
+      "3  Extended   College        2011-01-11         Employed      M  ...   \n",
+      "4   Premium  Bachelor        2011-01-17    Medical Leave      F  ...   \n",
+      "\n",
+      "      policy_type        policy renew_offer_type  sales_channel  \\\n",
+      "0  Corporate Auto  Corporate L3           Offer3          Agent   \n",
+      "1   Personal Auto   Personal L3           Offer4    Call Center   \n",
+      "2   Personal Auto   Personal L3           Offer3    Call Center   \n",
+      "3  Corporate Auto  Corporate L3           Offer2         Branch   \n",
+      "4   Personal Auto   Personal L2           Offer1         Branch   \n",
+      "\n",
+      "   total_claim_amount  vehicle_class  vehicle_size  vehicle_type month  \\\n",
+      "0          292.800000  Four-Door Car       Medsize             A     2   \n",
+      "1          744.924331  Four-Door Car       Medsize             A     1   \n",
+      "2          480.000000            SUV       Medsize             A     2   \n",
+      "3          484.013411  Four-Door Car       Medsize             A     1   \n",
+      "4          707.925645  Four-Door Car       Medsize             A     1   \n",
+      "\n",
+      "  effective_month  \n",
+      "0               2  \n",
+      "1               1  \n",
+      "2               2  \n",
+      "3               1  \n",
+      "4               1  \n",
+      "\n",
+      "[5 rows x 28 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Cleaned Data Info:\")\n",
+    "df.info()  # info() writes its own report and returns None; wrapping it in print() added a stray 'None'\n",
+    "\n",
+    "print(\"\\nCleaned Data Sample:\")\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "b42d04d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n",
+      "       'response', 'coverage', 'education', 'effective_to_date',\n",
+      "       'employmentstatus', 'gender', 'income', 'location_code',\n",
+      "       'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n",
+      "       'months_since_policy_inception', 'number_of_open_complaints',\n",
+      "       'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n",
+      "       'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n",
+      "       'vehicle_type', 'month', 'effective_month'],\n",
+      "      dtype='object')\n",
+      "               total_revenue\n",
+      "sales_channel               \n",
+      "Agent             1810226.82\n",
+      "Branch            1301204.00\n",
+      "Call Center        926600.82\n",
+      "Web                706600.04\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\migue\\AppData\\Local\\Temp\\ipykernel_26136\\3335164488.py:6: FutureWarning: The default value of observed=False is deprecated and will change to observed=True in a future version of pandas. Specify observed=False to silence this warning and retain the current behavior\n",
+      "  pivot_revenue = pd.pivot_table(\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.columns)\n",
+    "\n",
+    "df['estimated_yearly_premium'] = df['monthly_premium_auto'] * 12\n",
+    "\n",
+    "# Total per sales channel, summing total_claim_amount and reporting it as 'total_revenue'.\n",
+    "# NOTE(review): confirm that total_claim_amount is the intended revenue measure.\n",
+    "# observed=False is passed explicitly: sales_channel is categorical and pandas is changing\n",
+    "# the default, so pinning it keeps the result stable and silences the FutureWarning.\n",
+    "pivot_revenue = (\n",
+    "    pd.pivot_table(\n",
+    "        df,\n",
+    "        values='total_claim_amount',\n",
+    "        index='sales_channel',\n",
+    "        aggfunc='sum',\n",
+    "        observed=False,\n",
+    "    )\n",
+    "    .round(2)  # two decimal places\n",
+    "    .rename(columns={'total_claim_amount': 'total_revenue'})\n",
+    "    .sort_values(by='total_revenue', ascending=False)  # highest first\n",
+    ")\n",
+    "\n",
+    "print(pivot_revenue)\n"
    ]
   },
   {
@@ -103,6 +644,64 @@
     "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "ace7f314",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Average Customer Lifetime Value by Education and Gender:\n",
+      "gender                      F        M\n",
+      "education                             \n",
+      "High School or Below  8675.22  8149.69\n",
+      "Bachelor              7874.27  7703.60\n",
+      "College               7748.82  8052.46\n",
+      "Master                8157.05  8168.83\n",
+      "Doctor                7328.51  7415.33\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\migue\\AppData\\Local\\Temp\\ipykernel_26136\\2550185581.py:1: FutureWarning: The default value of observed=False is deprecated and will change to observed=True in a future version of pandas. Specify observed=False to silence this warning and retain the current behavior\n",
+      "  pivot_clv = pd.pivot_table(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Education levels, lowest to highest. NOTE(review): assumes 'College' (no completed degree) ranks below 'Bachelor' - confirm.\n",
+    "edu_order = [\n",
+    "    'High School or Below',\n",
+    "    'College',\n",
+    "    'Bachelor',\n",
+    "    'Master',\n",
+    "    'Doctor'\n",
+    "]\n",
+    "\n",
+    "# Mean customer lifetime value per education level (rows) and gender (columns), rounded to 2 decimals.\n",
+    "# observed=False is explicit: the grouping columns are categorical and the pandas default is changing.\n",
+    "pivot_clv = pd.pivot_table(\n",
+    "    df,\n",
+    "    values='customer_lifetime_value',\n",
+    "    index='education',\n",
+    "    columns='gender',\n",
+    "    aggfunc='mean',\n",
+    "    observed=False\n",
+    ").round(2)\n",
+    "\n",
+    "# Only keep and order education levels that exist in the data\n",
+    "existing_edu = [edu for edu in edu_order if edu in pivot_clv.index]\n",
+    "pivot_clv = pivot_clv.loc[existing_edu]\n",
+    "\n",
+    "print(\"Average Customer Lifetime Value by Education and Gender:\")\n",
+    "print(pivot_clv)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32c7f2e5-3d90-43e5-be33-9781b6069198",
@@ -146,7 +745,7 @@
    "provenance": []
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -160,7 +759,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,