diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..829b4f8 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -34,16 +34,128 @@ "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bb0e433d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "#Cleaned file\n", + "df1 = pd.read_csv(\"cleaned_dataset.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "077e47aa", + "metadata": {}, + "outputs": [], + "source": [ + "#2 others\n", + "\n", + "url2 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n", + "url3 = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n", + "\n", + "df2 = pd.read_csv(url2)\n", + "df3 = pd.read_csv(url3)\n", + "\n", + "# Combine for cleaning\n", + "df_new = pd.concat([df2, df3], ignore_index=True)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "492d06e3-92c7-4105-ac72-536db98d3244", - "metadata": { - "id": "492d06e3-92c7-4105-ac72-536db98d3244" - }, + "id": "dc7b2188", + "metadata": {}, "outputs": [], "source": [ - "# Your code goes here" + "# Clean\n", + "\n", + "df_new.columns = [col.lower() for col in df_new.columns]\n", + "df_new.columns = [col.replace(\" \", \"_\") for col in df_new.columns]\n", + "\n", + "df_new = df_new.rename(columns={\"st\":\"state\"})\n", + "\n", + "# Standardize Gender\n", + "df_new[\"gender\"] = df_new[\"gender\"].str.strip().str.upper().map({\n", + " 'F': 'F', 'FEMALE': 'F', 'FEMAL': 'F',\n", + " 'M': 'M', 'MALE': 'M'\n", + "})\n", + "\n", + "# State Abbreviations\n", + "state_map = {'AZ': 'Arizona', 'Cali': 'California', 'WA': 'Washington'}\n", + "df_new['state'] = df_new['state'].replace(state_map)\n", + "\n", + "# Education\n", + "df_new['education'] = df_new['education'].replace({'Bachelors': 'Bachelor'})\n", + "\n", + "# Customer Lifetime Value\n", + "df_new['customer_lifetime_value'] = (\n", + " df_new['customer_lifetime_value']\n", + " .astype(str)\n", + " .str.replace('%', '', regex=False)\n", + " .astype(float)\n", + ")\n", + "\n", + "# Vehicle Class\n", + "df_new['vehicle_class'] = df_new['vehicle_class'].replace({\n", + " 'Sports Car': 'Luxury',\n", + " 'Luxury SUV': 'Luxury',\n", + " 'Luxury Car': 'Luxury'\n", + "})\n", + "\n", + "# Number of Open Complaints\n", + "df_new['number_of_open_complaints'] = (\n", + " df_new['number_of_open_complaints']\n", + " .astype(str)\n", + " .apply(lambda x: int(x.split('/')[1]) if '/' in x else pd.NA)\n", + " .astype('Int64')\n", + ")\n", + "\n", + "# Null Values\n", + "num_cols = df_new.select_dtypes(include=['float', 'int', 'Int64']).columns\n", + "cat_cols = df_new.select_dtypes(include=['object', 'category']).columns\n", + "\n", + "for col in num_cols:\n", + " df_new[col] = df_new[col].fillna(df_new[col].median())\n", + "\n", + "for col in cat_cols:\n", + " df_new[col] = df_new[col].fillna(df_new[col].mode()[0])\n", + "\n", + "# Convert numerics to int\n", + "for col in num_cols:\n", + " df_new[col] = df_new[col].astype(int)\n", + "\n", + "# Drop Duplicates\n", + "df_new = df_new.drop_duplicates().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fca3be4", + "metadata": {}, + "outputs": [], + "source": [ + "# Combine with cleaned df1\n", + "df_combined = pd.concat([df1, df_new], ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77a456eb", + "metadata": {}, + "outputs": [], + "source": [ + "df_combined = df_combined.drop_duplicates().reset_index(drop=True)\n", + "\n", + "# Save cleaned and combined dataset\n", + "df_combined.to_csv(\"final_cleaned_data.csv\", index=False)" ] }, { @@ -72,14 +184,404 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | unnamed:_0 | \n", + "customer | \n", + "state | \n", + "customer_lifetime_value | \n", + "response | \n", + "coverage | \n", + "education | \n", + "effective_to_date | \n", + "employmentstatus | \n", + "gender | \n", + "... | \n", + "number_of_policies | \n", + "policy_type | \n", + "policy | \n", + "renew_offer_type | \n", + "sales_channel | \n", + "total_claim_amount | \n", + "vehicle_class | \n", + "vehicle_size | \n", + "vehicle_type | \n", + "month | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "DK49336 | \n", + "Arizona | \n", + "4809.216960 | \n", + "No | \n", + "Basic | \n", + "College | \n", + "2011-02-18 | \n", + "Employed | \n", + "M | \n", + "... | \n", + "9 | \n", + "Corporate Auto | \n", + "Corporate L3 | \n", + "Offer3 | \n", + "Agent | \n", + "292.800000 | \n", + "Four-Door Car | \n", + "Medsize | \n", + "A | \n", + "2 | \n", + "
1 | \n", + "1 | \n", + "KX64629 | \n", + "California | \n", + "2228.525238 | \n", + "No | \n", + "Basic | \n", + "College | \n", + "2011-01-18 | \n", + "Unemployed | \n", + "F | \n", + "... | \n", + "1 | \n", + "Personal Auto | \n", + "Personal L3 | \n", + "Offer4 | \n", + "Call Center | \n", + "744.924331 | \n", + "Four-Door Car | \n", + "Medsize | \n", + "A | \n", + "1 | \n", + "
2 | \n", + "2 | \n", + "LZ68649 | \n", + "Washington | \n", + "14947.917300 | \n", + "No | \n", + "Basic | \n", + "Bachelor | \n", + "2011-02-10 | \n", + "Employed | \n", + "M | \n", + "... | \n", + "2 | \n", + "Personal Auto | \n", + "Personal L3 | \n", + "Offer3 | \n", + "Call Center | \n", + "480.000000 | \n", + "SUV | \n", + "Medsize | \n", + "A | \n", + "2 | \n", + "
3 | \n", + "3 | \n", + "XL78013 | \n", + "Oregon | \n", + "22332.439460 | \n", + "Yes | \n", + "Extended | \n", + "College | \n", + "2011-01-11 | \n", + "Employed | \n", + "M | \n", + "... | \n", + "2 | \n", + "Corporate Auto | \n", + "Corporate L3 | \n", + "Offer2 | \n", + "Branch | \n", + "484.013411 | \n", + "Four-Door Car | \n", + "Medsize | \n", + "A | \n", + "1 | \n", + "
4 | \n", + "4 | \n", + "QA50777 | \n", + "Oregon | \n", + "9025.067525 | \n", + "No | \n", + "Premium | \n", + "Bachelor | \n", + "2011-01-17 | \n", + "Medical Leave | \n", + "F | \n", + "... | \n", + "7 | \n", + "Personal Auto | \n", + "Personal L2 | \n", + "Offer1 | \n", + "Branch | \n", + "707.925645 | \n", + "Four-Door Car | \n", + "Medsize | \n", + "A | \n", + "1 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
10905 | \n", + "10905 | \n", + "FE99816 | \n", + "Nevada | \n", + "15563.369440 | \n", + "No | \n", + "Premium | \n", + "Bachelor | \n", + "2011-01-19 | \n", + "Unemployed | \n", + "F | \n", + "... | \n", + "7 | \n", + "Personal Auto | \n", + "Personal L1 | \n", + "Offer3 | \n", + "Web | \n", + "1214.400000 | \n", + "Luxury Car | \n", + "Medsize | \n", + "A | \n", + "1 | \n", + "
10906 | \n", + "10906 | \n", + "KX53892 | \n", + "Oregon | \n", + "5259.444853 | \n", + "No | \n", + "Basic | \n", + "College | \n", + "2011-01-06 | \n", + "Employed | \n", + "F | \n", + "... | \n", + "6 | \n", + "Personal Auto | \n", + "Personal L3 | \n", + "Offer2 | \n", + "Branch | \n", + "273.018929 | \n", + "Four-Door Car | \n", + "Medsize | \n", + "A | \n", + "1 | \n", + "
10907 | \n", + "10907 | \n", + "TL39050 | \n", + "Arizona | \n", + "23893.304100 | \n", + "No | \n", + "Extended | \n", + "Bachelor | \n", + "2011-02-06 | \n", + "Employed | \n", + "F | \n", + "... | \n", + "2 | \n", + "Corporate Auto | \n", + "Corporate L3 | \n", + "Offer1 | \n", + "Web | \n", + "381.306996 | \n", + "Luxury SUV | \n", + "Medsize | \n", + "A | \n", + "2 | \n", + "
10908 | \n", + "10908 | \n", + "WA60547 | \n", + "California | \n", + "11971.977650 | \n", + "No | \n", + "Premium | \n", + "College | \n", + "2011-02-13 | \n", + "Employed | \n", + "F | \n", + "... | \n", + "6 | \n", + "Personal Auto | \n", + "Personal L1 | \n", + "Offer1 | \n", + "Branch | \n", + "618.288849 | \n", + "SUV | \n", + "Medsize | \n", + "A | \n", + "2 | \n", + "
10909 | \n", + "10909 | \n", + "IV32877 | \n", + "California | \n", + "6857.519928 | \n", + "No | \n", + "Basic | \n", + "Bachelor | \n", + "2011-01-08 | \n", + "Unemployed | \n", + "M | \n", + "... | \n", + "3 | \n", + "Personal Auto | \n", + "Personal L1 | \n", + "Offer4 | \n", + "Web | \n", + "1021.719397 | \n", + "SUV | \n", + "Medsize | \n", + "A | \n", + "1 | \n", + "
10910 rows × 27 columns
\n", + "