data-bootcamp-v4 · ruchikagogia1799 · Aug 3, 2025 · Aug 3, 2025
diff --git a/combined_cleaned_customer_data.csv b/combined_cleaned_customer_data.csv
diff --git a/file1.csv b/file1.csv
diff --git a/file2.csv b/file2.csv
diff --git a/file3.csv b/file3.csv
diff --git a/file4.csv b/file4.csv
diff --git a/file5.csv b/file5.csv
diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,88 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "492d06e3-92c7-4105-ac72-536db98d3244",
    "metadata": {
     "id": "492d06e3-92c7-4105-ac72-536db98d3244"
    },
    "outputs": [],
    "source": [
-    "# Your code goes here"
+    "import pandas as pd\n",
+    "\n",
+    "def clean_customer_data(df):\n",
+    "    \"\"\"Return a cleaned copy of raw customer data `df`; the input is not modified.\n",
+    "\n",
+    "    Column names are stripped, lower-cased and have spaces turned into\n",
+    "    underscores; the gender, state, education, customer_lifetime_value,\n",
+    "    vehicle_class and number_of_open_complaints columns are tidied when present.\n",
+    "    \"\"\"\n",
+    "    df = df.copy()  # work on a copy so the caller's raw frame stays intact\n",
+    "    # Standardize column names\n",
+    "    df.columns = df.columns.str.strip().str.lower().str.replace(\" \", \"_\")\n",
+    "\n",
+    "    # Collapse gender spellings to 'F' / 'M'\n",
+    "    if 'gender' in df.columns:\n",
+    "        df['gender'] = (\n",
+    "            df['gender'].str.strip().str.lower()\n",
+    "            .replace({'femal': 'f', 'female': 'f', 'male': 'm'})\n",
+    "            .str.upper()\n",
+    "        )\n",
+    "\n",
+    "    # Expand abbreviated state names\n",
+    "    if 'state' in df.columns:\n",
+    "        df['state'] = df['state'].replace(\n",
+    "            {'AZ': 'Arizona', 'Cali': 'California', 'WA': 'Washington'}\n",
+    "        )\n",
+    "\n",
+    "    # Unify the degree spelling\n",
+    "    if 'education' in df.columns:\n",
+    "        df['education'] = df['education'].replace({'Bachelors': 'Bachelor'})\n",
+    "\n",
+    "    # Remove '%' characters, then parse as numbers (unparseable values become NaN)\n",
+    "    if 'customer_lifetime_value' in df.columns:\n",
+    "        clv = df['customer_lifetime_value'].astype(str).str.replace('%', '', regex=False)\n",
+    "        df['customer_lifetime_value'] = pd.to_numeric(clv, errors='coerce')\n",
+    "\n",
+    "    # Merge the luxury and sports classes into a single 'Luxury' class\n",
+    "    if 'vehicle_class' in df.columns:\n",
+    "        df['vehicle_class'] = df['vehicle_class'].replace(\n",
+    "            {'Luxury SUV': 'Luxury', 'Luxury Car': 'Luxury', 'Sports Car': 'Luxury'}\n",
+    "        )\n",
+    "\n",
+    "    # Slash-separated values keep their second field; values without a '/'\n",
+    "    # are parsed as-is instead of being indexed past their end and lost as NaN.\n",
+    "    if 'number_of_open_complaints' in df.columns:\n",
+    "        raw = df['number_of_open_complaints'].astype(str)\n",
+    "        middle = raw.str.split(\"/\").str[1]\n",
+    "        counts = middle.where(raw.str.contains(\"/\", regex=False), raw)\n",
+    "        df['number_of_open_complaints'] = pd.to_numeric(counts, errors='coerce')\n",
+    "\n",
+    "    return df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "23f88997",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the three raw customer extracts\n",
+    "file3, file4, file5 = (\n",
+    "    pd.read_csv(path) for path in (\"file3.csv\", \"file4.csv\", \"file5.csv\")\n",
+    ")\n",
+    "\n",
+    "# Run every extract through the shared cleaning routine\n",
+    "file3_clean, file4_clean, file5_clean = map(\n",
+    "    clean_customer_data, (file3, file4, file5)\n",
+    ")\n",
+    "\n",
+    "# Stack the cleaned frames under a fresh 0..n-1 index\n",
+    "combined_df = pd.concat([file3_clean, file4_clean, file5_clean], ignore_index=True)\n",
+    "\n",
+    "# Persist the combined result without writing the index column\n",
+    "combined_df.to_csv(\"combined_cleaned_customer_data.csv\", index=False)\n"
    ]
   },
   {
@@ -72,14 +146,77 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
    "metadata": {
     "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
    },
    "outputs": [],
    "source": [
-    "# Your code goes here"
+    "marketing_df = pd.read_csv(\"marketing_customer_analysis_clean.csv\")  # load the marketing data\n",
+    "marketing_df.columns = [col.strip().lower().replace(\" \", \"_\") for col in marketing_df.columns]  # normalize headers\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a621cb3d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " Total Revenue by Sales Channel:\n",
+      "               total_claim_amount\n",
+      "sales_channel                    \n",
+      "Agent                  1810226.82\n",
+      "Branch                 1301204.00\n",
+      "Call Center             926600.82\n",
+      "Web                     706600.04\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sum total_claim_amount per sales channel, rounded to two decimals\n",
+    "revenue_by_channel = marketing_df.pivot_table(\n",
+    "    values='total_claim_amount',\n",
+    "    index='sales_channel',\n",
+    "    aggfunc='sum',\n",
+    ").round(2)\n",
+    "\n",
+    "print(\" Total Revenue by Sales Channel:\", revenue_by_channel, sep=\"\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f1a1c67c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " Average Customer Lifetime Value by Gender & Education:\n",
+      "education  Bachelor  College   Doctor  High School or Below   Master\n",
+      "gender                                                              \n",
+      "F           7874.27  7748.82  7328.51               8675.22  8157.05\n",
+      "M           7703.60  8052.46  7415.33               8149.69  8168.83\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Mean customer_lifetime_value for each gender (rows) and education (columns)\n",
+    "clv_by_gender_education = marketing_df.pivot_table(\n",
+    "    values='customer_lifetime_value',\n",
+    "    index='gender',\n",
+    "    columns='education',\n",
+    "    aggfunc='mean',\n",
+    ").round(2)\n",
+    "\n",
+    "print(\"\\n Average Customer Lifetime Value by Gender & Education:\", clv_by_gender_education, sep=\"\\n\")\n"
    ]
   },
   {
@@ -130,23 +267,61 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "3a069e0b-b400-470e-904d-d17582191be4",
    "metadata": {
     "id": "3a069e0b-b400-470e-904d-d17582191be4"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " Complaints by Policy Type and Month (Long Format):\n",
+      "      policy_type     month  total_complaints\n",
+      "0  Corporate Auto  February        385.208135\n",
+      "1  Corporate Auto   January        443.434952\n",
+      "2   Personal Auto  February       1453.684441\n",
+      "3   Personal Auto   January       1727.605722\n",
+      "4    Special Auto  February         95.226817\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code goes here"
+    "# Parse effective_to_date as datetimes; values that cannot be parsed become NaT\n",
+    "marketing_df['effective_to_date'] = pd.to_datetime(\n",
+    "    marketing_df['effective_to_date'], errors='coerce'\n",
+    ")\n",
+    "\n",
+    "# Derive the month name from the parsed date\n",
+    "marketing_df['month'] = marketing_df['effective_to_date'].dt.month_name()\n",
+    "\n",
+    "# Long format: one row per (policy_type, month) pair with the summed complaints\n",
+    "complaints_summary = (\n",
+    "    marketing_df.groupby(['policy_type', 'month'])['number_of_open_complaints']\n",
+    "    .sum()\n",
+    "    .reset_index(name='total_complaints')\n",
+    ")\n",
+    "\n",
+    "print(\"\\n Complaints by Policy Type and Month (Long Format):\", complaints_summary.head(), sep=\"\\n\")\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b23e0cc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "colab": {
    "provenance": []
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -160,7 +335,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,