Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 234 additions & 15 deletions lab-dw-data-structuring-and-combining.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,120 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "492d06e3-92c7-4105-ac72-536db98d3244",
"metadata": {
"id": "492d06e3-92c7-4105-ac72-536db98d3244"
},
"outputs": [],
"execution_count": 20,
"id": "d0e9d56d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" customer st gender education customer_lifetime_value \\\n",
"0 RB50392 Washington F Master NaN \n",
"1 QZ44356 Arizona F Bachelor 697953.59 \n",
"2 AI49188 Nevada F Bachelor 1288743.17 \n",
"3 WW63253 California M Bachelor 764586.18 \n",
"4 GA49547 Washington M High School or Below 536307.65 \n",
"\n",
" income monthly_premium_auto number_of_open_complaints policy_type \\\n",
"0 0.0 1000.0 1/0/00 Personal Auto \n",
"1 0.0 94.0 1/0/00 Personal Auto \n",
"2 48767.0 108.0 1/0/00 Personal Auto \n",
"3 0.0 106.0 1/0/00 Corporate Auto \n",
"4 36357.0 68.0 1/0/00 Personal Auto \n",
"\n",
" vehicle_class total_claim_amount state \n",
"0 Four-Door Car 2.704934 NaN \n",
"1 Four-Door Car 1131.464935 NaN \n",
"2 Two-Door Car 566.472247 NaN \n",
"3 SUV 529.881344 NaN \n",
"4 Four-Door Car 17.269323 NaN \n"
]
}
],
"source": [
"# Your code goes here"
"import pandas as pd\n",
"\n",
"# Data Loader\n",
"def load_data(url):\n",
" \"\"\"Loads data from a given URL into a Pandas DataFrame.\"\"\"\n",
" df = pd.read_csv(url)\n",
" return df\n",
"\n",
"# Data Cleaner\n",
"def clean_column_names(df):\n",
" \"\"\"Cleans column names by ensuring consistency.\"\"\"\n",
" df.columns = [col.lower().replace(' ', '_') for col in df.columns]\n",
" return df\n",
"\n",
"def replace_values(df, column, replacements):\n",
" \"\"\"Replaces specified values in a column.\"\"\"\n",
" if column in df.columns:\n",
" df[column] = df[column].replace(replacements)\n",
" return df\n",
"\n",
"# Data Formatter\n",
"def convert_to_numeric(df, column):\n",
" \"\"\"Converts a column to numeric values after cleaning its format.\"\"\"\n",
" if column in df.columns:\n",
" # Convert the column to string to allow for string operations\n",
" df[column] = df[column].astype(str).str.replace('%', '')\n",
" df[column] = pd.to_numeric(df[column], errors='coerce') # Handles conversion errors gracefully\n",
" return df\n",
"\n",
"def impute_missing_values(df, column, strategy='mode'):\n",
" \"\"\"Imputes missing values in a column using a specified strategy (e.g., 'mean', 'median', 'mode').\"\"\"\n",
" if column in df.columns:\n",
" if strategy == 'mode':\n",
" mode_value = df[column].mode()[0]\n",
" df[column] = df[column].fillna(mode_value)\n",
" # Add more strategies if needed\n",
" return df\n",
"\n",
"# Data Processor\n",
"def drop_duplicates(df):\n",
" \"\"\"Drops duplicate rows from the DataFrame.\"\"\"\n",
" return df.drop_duplicates()\n",
"\n",
"def drop_null_values(df, column):\n",
" \"\"\"Drops rows with null values in specified columns.\"\"\"\n",
" if column in df.columns:\n",
" df = df.dropna(subset=[column])\n",
" return df\n",
"\n",
"# Main Function\n",
"def main():\n",
" \"\"\"Main function to orchestrate data loading, cleaning, and processing.\"\"\"\n",
" urls = [\n",
" \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\",\n",
" \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\",\n",
" \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n",
" ]\n",
" \n",
" combined_df = pd.DataFrame()\n",
" \n",
" for url in urls:\n",
" # Load data\n",
" df = load_data(url)\n",
" \n",
" # Clean the data\n",
" df = clean_column_names(df)\n",
" df = replace_values(df, 'st', {'Cali': 'California', 'WA': 'Washington'})\n",
" df = replace_values(df, 'gender', {'Femal': 'F', 'Male': 'M'})\n",
" df = convert_to_numeric(df, 'customer_lifetime_value')\n",
" df = impute_missing_values(df, 'gender', 'mode')\n",
" df = drop_duplicates(df)\n",
" df = drop_null_values(df, 'customer')\n",
" \n",
" # Combine the DataFrame with the rest\n",
" combined_df = pd.concat([combined_df, df], ignore_index=True)\n",
" \n",
" return combined_df\n",
"\n",
"# Run the main function\n",
"if __name__ == \"__main__\":\n",
" df = main()\n",
" print(df.head())"
]
},
{
Expand Down Expand Up @@ -72,14 +178,96 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
"metadata": {
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" unnamed:_0 customer state customer_lifetime_value response \\\n",
"0 0 DK49336 Arizona 4809.216960 No \n",
"1 1 KX64629 California 2228.525238 No \n",
"2 2 LZ68649 Washington 14947.917300 No \n",
"3 3 XL78013 Oregon 22332.439460 Yes \n",
"4 4 QA50777 Oregon 9025.067525 No \n",
"\n",
" coverage education effective_to_date employmentstatus gender ... \\\n",
"0 Basic College 2011-02-18 Employed M ... \n",
"1 Basic College 2011-01-18 Unemployed F ... \n",
"2 Basic Bachelor 2011-02-10 Employed M ... \n",
"3 Extended College 2011-01-11 Employed M ... \n",
"4 Premium Bachelor 2011-01-17 Medical Leave F ... \n",
"\n",
" number_of_policies policy_type policy renew_offer_type \\\n",
"0 9 Corporate Auto Corporate L3 Offer3 \n",
"1 1 Personal Auto Personal L3 Offer4 \n",
"2 2 Personal Auto Personal L3 Offer3 \n",
"3 2 Corporate Auto Corporate L3 Offer2 \n",
"4 7 Personal Auto Personal L2 Offer1 \n",
"\n",
" sales_channel total_claim_amount vehicle_class vehicle_size \\\n",
"0 Agent 292.800000 Four-Door Car Medsize \n",
"1 Call Center 744.924331 Four-Door Car Medsize \n",
"2 Call Center 480.000000 SUV Medsize \n",
"3 Branch 484.013411 Four-Door Car Medsize \n",
"4 Branch 707.925645 Four-Door Car Medsize \n",
"\n",
" vehicle_type month \n",
"0 A 2 \n",
"1 A 1 \n",
"2 A 2 \n",
"3 A 1 \n",
"4 A 1 \n",
"\n",
"[5 rows x 27 columns]\n",
"Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',\n",
" 'response', 'coverage', 'education', 'effective_to_date',\n",
" 'employmentstatus', 'gender', 'income', 'location_code',\n",
" 'marital_status', 'monthly_premium_auto', 'months_since_last_claim',\n",
" 'months_since_policy_inception', 'number_of_open_complaints',\n",
" 'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',\n",
" 'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',\n",
" 'vehicle_type', 'month'],\n",
" dtype='object')\n",
" total_claim_amount\n",
"sales_channel \n",
"Agent 1810226.82\n",
"Branch 1301204.00\n",
"Call Center 926600.82\n",
"Web 706600.04\n"
]
}
],
"source": [
"# Your code goes here"
"import pandas as pd\n",
"\n",
"# Load the dataset\n",
"url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
"df = pd.read_csv(url)\n",
"\n",
"# Display first few rows to understand the data structure\n",
"print(df.head())\n",
"\n",
"# Check the columns in the DataFrame\n",
"print(df.columns)\n",
"\n",
"# Create pivot table to summarize total revenue by sales channel\n",
"pivot_table = df.pivot_table(values='total_claim_amount', \n",
" index='sales_channel', \n",
" aggfunc='sum')\n",
"\n",
"# Round the total revenue to 2 decimal places\n",
"pivot_table = pivot_table.round(2)\n",
"\n",
"# Sort the pivot table to see which sales channel has the highest revenue\n",
"pivot_table = pivot_table.sort_values(by='total_claim_amount', ascending=False)\n",
"\n",
"# Display the pivot table\n",
"print(pivot_table)"
]
},
{
Expand Down Expand Up @@ -130,14 +318,45 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"id": "3a069e0b-b400-470e-904d-d17582191be4",
"metadata": {
"id": "3a069e0b-b400-470e-904d-d17582191be4"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" month policy_type number_of_complaints\n",
"0 1 Corporate Auto 1252\n",
"1 1 Personal Auto 4329\n",
"2 1 Special Auto 237\n",
"3 2 Corporate Auto 1089\n",
"4 2 Personal Auto 3799\n",
"5 2 Special Auto 204\n"
]
}
],
"source": [
"# Your code goes here"
"import pandas as pd\n",
"\n",
"# Load the dataset\n",
"url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
"df = pd.read_csv(url)\n",
"\n",
"# Convert date column if necessary to extract month information\n",
"df['effective_to_date'] = pd.to_datetime(df['effective_to_date'])\n",
"\n",
"# Extract month information from the date column\n",
"df['month'] = df['effective_to_date'].dt.month\n",
"\n",
"# Assuming 'complaints' is a placeholder for actual complaint records:\n",
"# Count the number of complaints by policy type and month\n",
"complaints_summary = df.groupby(['month', 'policy_type']).size().reset_index(name='number_of_complaints')\n",
"\n",
"# Showcase in long format\n",
"print(complaints_summary)"
]
}
],
Expand All @@ -146,7 +365,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -160,7 +379,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down