data-bootcamp-v4 · MBengochea · Sep 12, 2025
diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,120 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "492d06e3-92c7-4105-ac72-536db98d3244",
-   "metadata": {
-    "id": "492d06e3-92c7-4105-ac72-536db98d3244"
-   },
-   "outputs": [],
+   "execution_count": 20,
+   "id": "d0e9d56d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  customer          st gender             education  customer_lifetime_value  \\\n",
+      "0  RB50392  Washington      F                Master                      NaN   \n",
+      "1  QZ44356     Arizona      F              Bachelor                697953.59   \n",
+      "2  AI49188      Nevada      F              Bachelor               1288743.17   \n",
+      "3  WW63253  California      M              Bachelor                764586.18   \n",
+      "4  GA49547  Washington      M  High School or Below                536307.65   \n",
+      "\n",
+      "    income  monthly_premium_auto number_of_open_complaints     policy_type  \\\n",
+      "0      0.0                1000.0                    1/0/00   Personal Auto   \n",
+      "1      0.0                  94.0                    1/0/00   Personal Auto   \n",
+      "2  48767.0                 108.0                    1/0/00   Personal Auto   \n",
+      "3      0.0                 106.0                    1/0/00  Corporate Auto   \n",
+      "4  36357.0                  68.0                    1/0/00   Personal Auto   \n",
+      "\n",
+      "   vehicle_class  total_claim_amount state  \n",
+      "0  Four-Door Car            2.704934   NaN  \n",
+      "1  Four-Door Car         1131.464935   NaN  \n",
+      "2   Two-Door Car          566.472247   NaN  \n",
+      "3            SUV          529.881344   NaN  \n",
+      "4  Four-Door Car           17.269323   NaN  \n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "\n",
+    "# Data Loader\n",
+    "def load_data(url):\n",
+    "    \"\"\"Loads data from a given URL into a Pandas DataFrame.\"\"\"\n",
+    "    df = pd.read_csv(url)\n",
+    "    return df\n",
+    "\n",
+    "# Data Cleaner\n",
+    "def clean_column_names(df):\n",
+    "    \"\"\"Normalizes column names: lower-cases them and replaces spaces with underscores.\"\"\"\n",
+    "    df.columns = [col.lower().replace(' ', '_') for col in df.columns]\n",
+    "    return df\n",
+    "\n",
+    "def replace_values(df, column, replacements):\n",
+    "    \"\"\"Replaces specified values in a column.\"\"\"\n",
+    "    if column in df.columns:\n",
+    "        df[column] = df[column].replace(replacements)\n",
+    "    return df\n",
+    "\n",
+    "# Data Formatter\n",
+    "def convert_to_numeric(df, column):\n",
+    "    \"\"\"Strips '%' signs from a column and converts it to numeric; unparseable values become NaN.\"\"\"\n",
+    "    if column in df.columns:\n",
+    "        # Convert the column to string to allow for string operations\n",
+    "        df[column] = df[column].astype(str).str.replace('%', '')\n",
+    "        df[column] = pd.to_numeric(df[column], errors='coerce')  # errors='coerce' turns unparseable values into NaN instead of raising\n",
+    "    return df\n",
+    "\n",
+    "def impute_missing_values(df, column, strategy='mode'):\n",
+    "    \"\"\"Fills missing values in a column. Only the 'mode' strategy is implemented; any other strategy leaves the column unchanged.\"\"\"\n",
+    "    if column in df.columns:\n",
+    "        if strategy == 'mode':\n",
+    "            mode_value = df[column].mode()[0]\n",
+    "            df[column] = df[column].fillna(mode_value)\n",
+    "        # Add more strategies if needed\n",
+    "    return df\n",
+    "\n",
+    "# Data Processor\n",
+    "def drop_duplicates(df):\n",
+    "    \"\"\"Drops duplicate rows from the DataFrame.\"\"\"\n",
+    "    return df.drop_duplicates()\n",
+    "\n",
+    "def drop_null_values(df, column):\n",
+    "    \"\"\"Drops rows that have a null value in the specified column.\"\"\"\n",
+    "    if column in df.columns:\n",
+    "        df = df.dropna(subset=[column])\n",
+    "    return df\n",
+    "\n",
+    "# Main Function\n",
+    "def main():\n",
+    "    \"\"\"Main function to orchestrate data loading, cleaning, and processing.\"\"\"\n",
+    "    urls = [\n",
+    "        \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\",\n",
+    "        \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\",\n",
+    "        \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n",
+    "    ]\n",
+    "    \n",
+    "    combined_df = pd.DataFrame()\n",
+    "    \n",
+    "    for url in urls:\n",
+    "        # Load data\n",
+    "        df = load_data(url)\n",
+    "        \n",
+    "        # Clean the data\n",
+    "        df = clean_column_names(df)\n",
+    "        df = replace_values(df, 'st', {'Cali': 'California', 'WA': 'Washington'})\n",
+    "        df = replace_values(df, 'gender', {'Femal': 'F', 'Male': 'M'})\n",
+    "        df = convert_to_numeric(df, 'customer_lifetime_value')\n",
+    "        df = impute_missing_values(df, 'gender', 'mode')\n",
+    "        df = drop_duplicates(df)\n",
+    "        df = drop_null_values(df, 'customer')\n",
+    "        \n",
+    "        # Combine the DataFrame with the rest\n",
+    "        combined_df = pd.concat([combined_df, df], ignore_index=True)\n",
+    "    \n",
+    "    return combined_df\n",
+    "\n",
+    "# Run the main function\n",
+    "if __name__ == \"__main__\":\n",
+    "    df = main()\n",
+    "    print(df.head())"
    ]
   },
   {
@@ -72,14 +178,96 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
    "metadata": {
     "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   unnamed:_0 customer       state  customer_lifetime_value response  \\\n",
+      "0           0  DK49336     Arizona              4809.216960       No   \n",
+      "1           1  KX64629  California              2228.525238       No   \n",
+      "2           2  LZ68649  Washington             14947.917300       No   \n",
+      "3           3  XL78013      Oregon             22332.439460      Yes   \n",
+      "4           4  QA50777      Oregon              9025.067525       No   \n",
+      "\n",
+      "   coverage education effective_to_date employmentstatus gender  ...  \\\n",
+      "0     Basic   College        2011-02-18         Employed      M  ...   \n",
+      "1     Basic   College        2011-01-18       Unemployed      F  ...   \n",
+      "2     Basic  Bachelor        2011-02-10         Employed      M  ...   \n",
+      "3  Extended   College        2011-01-11         Employed      M  ...   \n",
+      "4   Premium  Bachelor        2011-01-17    Medical Leave      F  ...   \n",
+      "\n",
+      "   number_of_policies     policy_type        policy  renew_offer_type  \\\n",
+      "0                   9  Corporate Auto  Corporate L3            Offer3   \n",
+      "1                   1   Personal Auto   Personal L3            Offer4   \n",
+      "2                   2   Personal Auto   Personal L3            Offer3   \n",
+      "3                   2  Corporate Auto  Corporate L3            Offer2   \n",
+      "4                   7   Personal Auto   Personal L2            Offer1   \n",
+      "\n",
+      "   sales_channel  total_claim_amount  vehicle_class  vehicle_size  \\\n",
+      "0          Agent          292.800000  Four-Door Car       Medsize   \n",
+      "1    Call Center          744.924331  Four-Door Car       Medsize   \n",
+      "2    Call Center          480.000000            SUV       Medsize   \n",
+      "3         Branch          484.013411  Four-Door Car       Medsize   \n",
+      "4         Branch          707.925645  Four-Door Car       Medsize   \n",
+      "\n",
+      "  vehicle_type month  \n",
+      "0            A     2  \n",
+      "1            A     1  \n",
+      "2            A     2  \n",
+      "3            A     1  \n",
+      "4            A     1  \n",
+      "\n",
+      "[5 rows x 27 columns]\n",
+      "Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n",
+      "       'response', 'coverage', 'education', 'effective_to_date',\n",
+      "       'employmentstatus', 'gender', 'income', 'location_code',\n",
+      "       'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n",
+      "       'months_since_policy_inception', 'number_of_open_complaints',\n",
+      "       'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n",
+      "       'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n",
+      "       'vehicle_type', 'month'],\n",
+      "      dtype='object')\n",
+      "               total_claim_amount\n",
+      "sales_channel                    \n",
+      "Agent                  1810226.82\n",
+      "Branch                 1301204.00\n",
+      "Call Center             926600.82\n",
+      "Web                     706600.04\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "\n",
+    "# Load the dataset\n",
+    "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
+    "df = pd.read_csv(url)\n",
+    "\n",
+    "# Display first few rows to understand the data structure\n",
+    "print(df.head())\n",
+    "\n",
+    "# Check the columns in the DataFrame\n",
+    "print(df.columns)\n",
+    "\n",
+    "# Create pivot table summarizing total revenue (sum of total_claim_amount) by sales channel\n",
+    "pivot_table = df.pivot_table(values='total_claim_amount', \n",
+    "                             index='sales_channel', \n",
+    "                             aggfunc='sum')\n",
+    "\n",
+    "# Round the total revenue to 2 decimal places\n",
+    "pivot_table = pivot_table.round(2)\n",
+    "\n",
+    "# Sort the pivot table to see which sales channel has the highest revenue\n",
+    "pivot_table = pivot_table.sort_values(by='total_claim_amount', ascending=False)\n",
+    "\n",
+    "# Display the pivot table\n",
+    "print(pivot_table)"
    ]
   },
   {
@@ -130,14 +318,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "id": "3a069e0b-b400-470e-904d-d17582191be4",
    "metadata": {
     "id": "3a069e0b-b400-470e-904d-d17582191be4"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   month     policy_type  number_of_complaints\n",
+      "0      1  Corporate Auto                  1252\n",
+      "1      1   Personal Auto                  4329\n",
+      "2      1    Special Auto                   237\n",
+      "3      2  Corporate Auto                  1089\n",
+      "4      2   Personal Auto                  3799\n",
+      "5      2    Special Auto                   204\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "\n",
+    "# Load the dataset\n",
+    "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
+    "df = pd.read_csv(url)\n",
+    "\n",
+    "# Convert date column if necessary to extract month information\n",
+    "df['effective_to_date'] = pd.to_datetime(df['effective_to_date'])\n",
+    "\n",
+    "# Extract month information from the date column\n",
+    "df['month'] = df['effective_to_date'].dt.month\n",
+    "\n",
+    "# NOTE: .size() counts the rows in each (month, policy_type) group, not complaints;\n",
+    "# summing 'number_of_open_complaints' per group would give the actual complaint totals\n",
+    "complaints_summary = df.groupby(['month', 'policy_type']).size().reset_index(name='number_of_complaints')\n",
+    "\n",
+    "# Showcase in long format\n",
+    "print(complaints_summary)"
    ]
   }
  ],
@@ -146,7 +365,7 @@
    "provenance": []
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -160,7 +379,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,