import pandas as pd

# Source files for the customer data set; ``main`` loads, cleans and stacks
# each of them in this order.
DATA_URLS = [
    "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv",
    "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv",
    "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv",
]

# The files do not agree on the header of the state column ("ST" vs "State").
# Without this alias the combined frame ends up with two half-empty columns,
# ``st`` and ``state``.
COLUMN_RENAMES = {'st': 'state'}


# Data Loader
def load_data(url):
    """Load a CSV file from a URL (or local path) into a DataFrame."""
    return pd.read_csv(url)


# Data Cleaner
def clean_column_names(df, renames=None):
    """Return a copy of ``df`` with normalised column names.

    Names are stripped, lower-cased and have spaces replaced by underscores.

    Args:
        df: DataFrame whose columns should be normalised.
        renames: Optional mapping ``{normalised_name: final_name}`` applied
            after normalisation, e.g. ``{'st': 'state'}``.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    # ``str(col)`` keeps this safe for non-string labels (e.g. integers).
    names = [str(col).strip().lower().replace(' ', '_') for col in df.columns]
    if renames:
        names = [renames.get(name, name) for name in names]
    cleaned = df.copy()
    cleaned.columns = names
    return cleaned


def replace_values(df, column, replacements):
    """Return ``df`` with the values of ``column`` replaced per ``replacements``.

    If ``column`` is not present the frame is returned unchanged.
    """
    if column not in df.columns:
        return df
    replaced = df.copy()
    replaced[column] = replaced[column].replace(replacements)
    return replaced


# Data Formatter
def convert_to_numeric(df, column):
    """Return ``df`` with ``column`` converted to numbers.

    Percent signs are removed first; anything that still cannot be parsed
    becomes NaN. If ``column`` is not present the frame is returned unchanged.
    """
    if column not in df.columns:
        return df
    converted = df.copy()
    # Go through ``str`` so the same code path handles text and numeric input.
    # ``regex=False`` makes '%' a literal regardless of the pandas version.
    as_text = converted[column].astype(str).str.replace('%', '', regex=False)
    converted[column] = pd.to_numeric(as_text.str.strip(), errors='coerce')
    return converted


def impute_missing_values(df, column, strategy='mode'):
    """Return ``df`` with nulls in ``column`` filled using ``strategy``.

    Args:
        df: Input DataFrame.
        column: Column to impute; a missing column is a no-op.
        strategy: ``'mode'`` (default), ``'mean'`` or ``'median'``.

    Returns:
        A new DataFrame, or ``df`` itself when there is nothing to do
        (column absent, or the column has no non-null value to derive a
        fill value from).

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    if column not in df.columns:
        return df
    if strategy not in ('mode', 'mean', 'median'):
        raise ValueError(
            f"Unknown imputation strategy {strategy!r}; "
            "expected 'mode', 'mean' or 'median'."
        )

    values = df[column]
    # An all-null column has no mode/mean/median; leave it as it is instead
    # of failing on an empty ``mode()`` result.
    if values.dropna().empty:
        return df

    if strategy == 'mode':
        fill_value = values.mode().iloc[0]
    elif strategy == 'mean':
        fill_value = values.mean()
    else:
        fill_value = values.median()

    imputed = df.copy()
    imputed[column] = values.fillna(fill_value)
    return imputed


# Data Processor
def drop_duplicates(df):
    """Return ``df`` without duplicate rows."""
    return df.drop_duplicates()


def drop_null_values(df, column):
    """Return ``df`` without the rows that are null in ``column``.

    ``column`` may be a single column name or a list of names; names that are
    not present in ``df`` are ignored.
    """
    requested = [column] if isinstance(column, str) else list(column)
    present = [name for name in requested if name in df.columns]
    if not present:
        return df
    return df.dropna(subset=present)


def _clean_frame(df):
    """Apply the full cleaning sequence to one freshly loaded file."""
    df = clean_column_names(df, renames=COLUMN_RENAMES)
    df = replace_values(df, 'state', {'Cali': 'California', 'WA': 'Washington'})
    # Collapse spelling variants onto the single-letter codes.
    df = replace_values(
        df,
        'gender',
        {'Femal': 'F', 'female': 'F', 'Female': 'F', 'Male': 'M', 'male': 'M'},
    )
    df = convert_to_numeric(df, 'customer_lifetime_value')
    df = impute_missing_values(df, 'gender', 'mode')
    df = drop_duplicates(df)
    df = drop_null_values(df, 'customer')
    return df


# Main Function
def main(urls=None):
    """Load, clean and combine the customer files into one DataFrame.

    Args:
        urls: Optional iterable of CSV URLs or paths. Defaults to
            ``DATA_URLS``.

    Returns:
        The cleaned frames stacked on top of each other with a fresh index.
        An empty DataFrame is returned when ``urls`` is empty.
    """
    sources = DATA_URLS if urls is None else list(urls)
    frames = [_clean_frame(load_data(url)) for url in sources]
    if not frames:
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop copies all previous
    # rows on every iteration.
    return pd.concat(frames, ignore_index=True)


# Run the main function
if __name__ == "__main__":
    df = main()
    print(df.head())
import pandas as pd

# Cleaned marketing data set used by both summaries below.
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv"


def revenue_by_sales_channel(df):
    """Summarise total revenue per sales channel.

    Revenue is taken from ``total_claim_amount``.

    Args:
        df: DataFrame with ``sales_channel`` and ``total_claim_amount``.

    Returns:
        A one-column pivot table indexed by ``sales_channel``, rounded to two
        decimals and sorted from the highest to the lowest total.
    """
    totals = df.pivot_table(
        values='total_claim_amount',
        index='sales_channel',
        aggfunc='sum',
    )
    return totals.round(2).sort_values(by='total_claim_amount', ascending=False)


def complaints_by_policy_and_month(df):
    """Count open complaints per month and policy type, in long format.

    Only customers with at least one open complaint contribute, and their
    ``number_of_open_complaints`` values are summed. Counting rows instead
    would report the number of customers, including those who never
    complained.

    Args:
        df: DataFrame with ``effective_to_date``, ``policy_type`` and
            ``number_of_open_complaints``.

    Returns:
        DataFrame with the columns ``month``, ``policy_type`` and
        ``number_of_complaints``, one row per observed combination.
    """
    data = df.copy()  # keep the caller's frame untouched
    data['month'] = pd.to_datetime(data['effective_to_date']).dt.month
    data['number_of_open_complaints'] = pd.to_numeric(
        data['number_of_open_complaints'], errors='coerce'
    )
    filed = data[data['number_of_open_complaints'] > 0]
    return (
        filed.groupby(['month', 'policy_type'])['number_of_open_complaints']
        .sum()
        .reset_index(name='number_of_complaints')
    )


if __name__ == "__main__":
    # Load the dataset once and reuse it for both summaries.
    df = pd.read_csv(url)

    # Display first few rows and the columns to understand the data structure
    print(df.head())
    print(df.columns)

    # Total revenue by sales channel, highest first
    pivot_table = revenue_by_sales_channel(df)
    print(pivot_table)

    # Number of complaints by policy type and month, in long format
    complaints_summary = complaints_by_policy_and_month(df)
    print(complaints_summary)
"base", "language": "python", "name": "python3" }, @@ -160,7 +379,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,