def _coalesce_duplicate_columns(frame):
    """Merge columns that share a label into one column.

    For every label that occurs more than once, the result holds a single
    column whose value in each row is the first non-null value found among
    the same-named columns, scanning left to right. Column order follows the
    first occurrence of each label. Frames without duplicate labels are
    returned unchanged.
    """
    labels = list(frame.columns)
    if len(set(labels)) == len(labels):
        return frame

    merged = {}
    for label in dict.fromkeys(labels):  # unique labels, first-seen order
        positions = [i for i, name in enumerate(labels) if name == label]
        combined = frame.iloc[:, positions[0]]
        for pos in positions[1:]:
            # Keep what we already have; fill only the gaps from the next copy.
            combined = combined.where(combined.notna(), frame.iloc[:, pos])
        merged[label] = combined
    return pd.DataFrame(merged)


def clean_data(df, column_aliases=None):
    """Return a cleaned copy of a combined customer DataFrame.

    Steps, in order:

    1. Normalise column labels: strip whitespace, lower-case, and replace
       spaces and hyphens with underscores.
    2. Rename labels through ``column_aliases`` so one field spelled
       differently across source files lands under a single label.
    3. Coalesce columns that now share a label (e.g. ``GENDER``/``Gender``
       both become ``gender``) into one column, first non-null value wins.
    4. Drop exact duplicate rows, then rows in which every value is missing.
    5. If a ``date`` column exists, parse it with ``pd.to_datetime``;
       unparseable values become ``NaT``.
    6. Print the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is copied first and never modified, so the cell calling
        this function can be re-run safely.
    column_aliases : dict, optional
        Mapping ``normalised_label -> canonical_label``. Defaults to
        ``{'st': 'state'}``. Pass ``{}`` to disable aliasing.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The original row index labels are kept (no reset).
    """
    aliases = {'st': 'state'} if column_aliases is None else dict(column_aliases)

    cleaned = df.copy()

    normalised = (
        cleaned.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_')
        .str.replace('-', '_')
    )
    cleaned.columns = [aliases.get(label, label) for label in normalised]

    # Without this step the concatenated files leave two `gender` columns and
    # a split `st` / `state` pair, each half-filled with NaN.
    cleaned = _coalesce_duplicate_columns(cleaned)

    cleaned = cleaned.drop_duplicates().dropna(how='all')

    if 'date' in cleaned.columns:
        cleaned['date'] = pd.to_datetime(cleaned['date'], errors='coerce')

    print("Missing values per column:")
    print(cleaned.isnull().sum())

    return cleaned
"policy_type 0\n", + "vehicle_class 0\n", + "total_claim_amount 0\n", + "state 2064\n", + "gender 2064\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "cleaned_df = clean_data(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6f6da246", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstgendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountstategender
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934NaNNaN
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935NaNNaN
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247NaNNaN
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344NaNNaN
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323NaNNaN
\n", + "
" + ], + "text/plain": [ + " customer st gender education customer_lifetime_value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount state gender \n", + "0 Four-Door Car 2.704934 NaN NaN \n", + "1 Four-Door Car 1131.464935 NaN NaN \n", + "2 Two-Door Car 566.472247 NaN NaN \n", + "3 SUV 529.881344 NaN NaN \n", + "4 Four-Door Car 17.269323 NaN NaN " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cleaned_df.head()" ] }, { @@ -72,14 +300,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, "outputs": [], "source": [ - "# Your code goes here" + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n", + "df = pd.read_csv(url)" ] }, { @@ -93,6 +322,91 @@ "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3d92b96d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_claim_amount
sales_channel
Agent1810226.82
Branch1301204.00
Call Center926600.82
Web706600.04
# %%
# Total revenue per sales channel: sum `total_claim_amount` within each
# `sales_channel` and round the totals to 2 decimal places.
revenue_by_channel = (
    df.pivot_table(
        index='sales_channel',
        values='total_claim_amount',
        aggfunc='sum',
    )
    .round(2)
)

revenue_by_channel

# %% [markdown]
# Highest revenue was brought by agents.
# Web sales are lowest. Probably because of the duration and information needed for the process.
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
educationBachelorCollegeDoctorHigh School or BelowMaster
gender
F7874.277748.827328.518675.228157.05
M7703.608052.467415.338149.698168.83
# %%
# Average customer lifetime value for each gender (rows) and education level
# (columns), rounded to 2 decimal places.
clv_by_gender_education = pd.pivot_table(
    df,
    values='customer_lifetime_value',
    index='gender',
    columns='education',
    aggfunc='mean',
).round(2)

clv_by_gender_education

# %% [markdown]
# Female customers who have an education of High School or below have the highest CLV.
# Higher education doesn't mean higher CLV.

# %%
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv"
df = pd.read_csv(url)

# %%
# Parse the policy date (unparseable values become NaT) and derive the month
# name used as a grouping key below.
df['effective_to_date'] = pd.to_datetime(df['effective_to_date'], errors='coerce')
df['month'] = df['effective_to_date'].dt.month_name()

# %%
# Long-format summary: number of complaints per policy type and month.
# Sum the per-customer complaint counts; `.size()` would only count rows
# (i.e. customers) in each group, not complaints.
# NOTE(review): assumes the clean CSV keeps the `number_of_open_complaints`
# column that the raw files have - confirm.
# Re-run the display cell after this change: any previously saved output
# reflects row counts.
complaints_summary = (
    df.groupby(['policy_type', 'month'])['number_of_open_complaints']
    .sum()
    .reset_index(name='number_of_complaints')
)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
policy_typemonthnumber_of_complaints
0Corporate AutoFebruary1089
1Corporate AutoJanuary1252
2Personal AutoFebruary3799
3Personal AutoJanuary4329
4Special AutoFebruary204
\n", + "
" + ], + "text/plain": [ + " policy_type month number_of_complaints\n", + "0 Corporate Auto February 1089\n", + "1 Corporate Auto January 1252\n", + "2 Personal Auto February 3799\n", + "3 Personal Auto January 4329\n", + "4 Special Auto February 204" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complaints_summary.head()" ] } ], @@ -146,7 +657,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -160,7 +671,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.7" } }, "nbformat": 4,