Skip to content
Open

lab #477

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12,075 changes: 12,075 additions & 0 deletions combined_cleaned_customer_data.csv

Large diffs are not rendered by default.

4,009 changes: 4,009 additions & 0 deletions file1.csv

Large diffs are not rendered by default.

997 changes: 997 additions & 0 deletions file2.csv

Large diffs are not rendered by default.

4,009 changes: 4,009 additions & 0 deletions file3.csv

Large diffs are not rendered by default.

997 changes: 997 additions & 0 deletions file4.csv

Large diffs are not rendered by default.

7,071 changes: 7,071 additions & 0 deletions file5.csv

Large diffs are not rendered by default.

193 changes: 184 additions & 9 deletions lab-dw-data-structuring-and-combining.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,88 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "492d06e3-92c7-4105-ac72-536db98d3244",
"metadata": {
"id": "492d06e3-92c7-4105-ac72-536db98d3244"
},
"outputs": [],
"source": [
"# Your code goes here"
"import pandas as pd\n",
"\n",
"def clean_customer_data(df):\n",
"    \"\"\"Return a cleaned copy of a raw customer DataFrame.\n",
"\n",
"    Column names are stripped, lower-cased and have spaces replaced by\n",
"    underscores. The columns below are then normalised, each only when it\n",
"    is present after renaming:\n",
"\n",
"    * gender: stripped and case-folded, 'femal'/'female' mapped to 'F'\n",
"      and 'male' to 'M'; every value ends up upper-cased.\n",
"    * state: 'AZ', 'Cali' and 'WA' expanded to full state names.\n",
"    * education: 'Bachelors' merged into 'Bachelor'.\n",
"    * customer_lifetime_value: '%' removed, then converted to numeric.\n",
"    * vehicle_class: 'Luxury SUV', 'Luxury Car' and 'Sports Car' merged\n",
"      into 'Luxury'.\n",
"    * number_of_open_complaints: converted to numeric; values containing\n",
"      '/' contribute the field after the first slash.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Raw customer data. The caller's object is left unmodified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        A new DataFrame with the cleaning steps applied. Values that\n",
"        cannot be parsed as numbers become NaN in the numeric columns.\n",
"    \"\"\"\n",
"    # Work on a copy so the caller's raw frame is not mutated and\n",
"    # re-running the cell on the same input gives the same result.\n",
"    df = df.copy()\n",
"\n",
"    # Standardize column names\n",
"    df.columns = (\n",
"        df.columns.str.strip().str.lower().str.replace(\" \", \"_\")\n",
"    )\n",
"\n",
"    # Fix gender\n",
"    if 'gender' in df.columns:\n",
"        df['gender'] = (\n",
"            df['gender']\n",
"            .str.strip()\n",
"            .str.lower()\n",
"            .replace({'femal': 'f', 'female': 'f', 'male': 'm'})\n",
"            .str.upper()\n",
"        )\n",
"\n",
"    # Fix state\n",
"    # NOTE(review): only a column literally named 'state' is handled; if\n",
"    # some input files abbreviate the header, it has to be renamed before\n",
"    # this step - confirm against the data files.\n",
"    if 'state' in df.columns:\n",
"        df['state'] = df['state'].replace({\n",
"            'AZ': 'Arizona', 'Cali': 'California', 'WA': 'Washington'\n",
"        })\n",
"\n",
"    # Fix education\n",
"    if 'education' in df.columns:\n",
"        df['education'] = df['education'].replace({'Bachelors': 'Bachelor'})\n",
"\n",
"    # Fix customer lifetime value\n",
"    if 'customer_lifetime_value' in df.columns:\n",
"        clv_text = (\n",
"            df['customer_lifetime_value']\n",
"            .astype(str)\n",
"            .str.replace('%', '', regex=False)\n",
"        )\n",
"        df['customer_lifetime_value'] = pd.to_numeric(clv_text, errors='coerce')\n",
"\n",
"    # Fix vehicle class\n",
"    if 'vehicle_class' in df.columns:\n",
"        df['vehicle_class'] = df['vehicle_class'].replace({\n",
"            'Luxury SUV': 'Luxury', 'Luxury Car': 'Luxury', 'Sports Car': 'Luxury'\n",
"        })\n",
"\n",
"    # Fix number_of_open_complaints\n",
"    if 'number_of_open_complaints' in df.columns:\n",
"        complaints = df['number_of_open_complaints'].astype(str)\n",
"        # For slash-separated values the second field is extracted.\n",
"        # Values without a slash split into a single element, so taking\n",
"        # field 1 would turn them into NaN; they are passed through\n",
"        # unchanged instead.\n",
"        has_slash = complaints.str.contains('/', regex=False)\n",
"        second_field = complaints.str.split('/').str[1]\n",
"        df['number_of_open_complaints'] = pd.to_numeric(\n",
"            second_field.where(has_slash, complaints), errors='coerce'\n",
"        )\n",
"\n",
"    return df\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "23f88997",
"metadata": {},
"outputs": [],
"source": [
"# Read the three raw customer files\n",
"file3, file4, file5 = (\n",
"    pd.read_csv(csv_name)\n",
"    for csv_name in (\"file3.csv\", \"file4.csv\", \"file5.csv\")\n",
")\n",
"\n",
"# Run every raw frame through the shared cleaning routine\n",
"file3_clean, file4_clean, file5_clean = map(\n",
"    clean_customer_data, (file3, file4, file5)\n",
")\n",
"\n",
"# Stack the cleaned frames and renumber the rows from 0\n",
"combined_df = pd.concat(\n",
"    [file3_clean, file4_clean, file5_clean],\n",
"    ignore_index=True,\n",
")\n",
"\n",
"# Optional: write the combined frame to disk without the index column\n",
"combined_df.to_csv(\"combined_cleaned_customer_data.csv\", index=False)\n"
]
},
{
Expand Down Expand Up @@ -72,14 +146,77 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
"metadata": {
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
},
"outputs": [],
"source": [
"# Your code goes here"
"# Load the marketing data, then strip, lower-case and underscore its headers\n",
"marketing_df = pd.read_csv(\"marketing_customer_analysis_clean.csv\")\n",
"marketing_df.columns = (\n",
"    marketing_df.columns\n",
"    .str.strip()\n",
"    .str.lower()\n",
"    .str.replace(\" \", \"_\")\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a621cb3d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Total Revenue by Sales Channel:\n",
" total_claim_amount\n",
"sales_channel \n",
"Agent 1810226.82\n",
"Branch 1301204.00\n",
"Call Center 926600.82\n",
"Web 706600.04\n"
]
}
],
"source": [
"# Sum total_claim_amount for each sales channel, rounded to 2 decimals\n",
"revenue_by_channel = marketing_df.pivot_table(\n",
"    values='total_claim_amount',\n",
"    index='sales_channel',\n",
"    aggfunc='sum',\n",
")\n",
"revenue_by_channel = revenue_by_channel.round(2)\n",
"\n",
"print(\" Total Revenue by Sales Channel:\")\n",
"print(revenue_by_channel)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f1a1c67c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Average Customer Lifetime Value by Gender & Education:\n",
"education Bachelor College Doctor High School or Below Master\n",
"gender \n",
"F 7874.27 7748.82 7328.51 8675.22 8157.05\n",
"M 7703.60 8052.46 7415.33 8149.69 8168.83\n"
]
}
],
"source": [
"# Mean customer_lifetime_value per gender (rows) and education (columns),\n",
"# rounded to 2 decimals\n",
"clv_by_gender_education = marketing_df.pivot_table(\n",
"    values='customer_lifetime_value',\n",
"    index='gender',\n",
"    columns='education',\n",
"    aggfunc='mean',\n",
")\n",
"clv_by_gender_education = clv_by_gender_education.round(2)\n",
"\n",
"print(\"\\n Average Customer Lifetime Value by Gender & Education:\")\n",
"print(clv_by_gender_education)\n"
]
},
{
Expand Down Expand Up @@ -130,23 +267,61 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "3a069e0b-b400-470e-904d-d17582191be4",
"metadata": {
"id": "3a069e0b-b400-470e-904d-d17582191be4"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Complaints by Policy Type and Month (Long Format):\n",
" policy_type month total_complaints\n",
"0 Corporate Auto February 385.208135\n",
"1 Corporate Auto January 443.434952\n",
"2 Personal Auto February 1453.684441\n",
"3 Personal Auto January 1727.605722\n",
"4 Special Auto February 95.226817\n"
]
}
],
"source": [
"# Your code goes here"
"# Parse the policy dates; values that cannot be parsed become NaT\n",
"marketing_df['effective_to_date'] = pd.to_datetime(\n",
"    marketing_df['effective_to_date'],\n",
"    errors='coerce',\n",
")\n",
"\n",
"# Derive the month name of each parsed date\n",
"marketing_df['month'] = marketing_df['effective_to_date'].dt.month_name()\n",
"\n",
"# Sum open complaints per (policy_type, month), one row per pair\n",
"complaints_summary = (\n",
"    marketing_df\n",
"    .groupby(['policy_type', 'month'])['number_of_open_complaints']\n",
"    .sum()\n",
"    .reset_index(name='total_complaints')\n",
")\n",
"\n",
"print(\"\\n Complaints by Policy Type and Month (Long Format):\")\n",
"print(complaints_summary.head())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b23e0cc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -160,7 +335,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down
Loading