diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb
index ec4e3f9..9f9f179 100644
--- a/lab-dw-data-structuring-and-combining.ipynb
+++ b/lab-dw-data-structuring-and-combining.ipynb
@@ -36,14 +36,242 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "492d06e3-92c7-4105-ac72-536db98d3244",
"metadata": {
"id": "492d06e3-92c7-4105-ac72-536db98d3244"
},
"outputs": [],
"source": [
- "# Your code goes here"
+ "import pandas as pd\n",
+ "\n",
+ "# Download the three raw customer files; unpacking order keeps the file1..file3 names.\n",
+ "csv_urls = (\n",
+ "    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv',\n",
+ "    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv',\n",
+ "    'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv',\n",
+ ")\n",
+ "file1, file2, file3 = map(pd.read_csv, csv_urls)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "6fb09589",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stack the three frames row-wise and renumber the index from zero.\n",
+ "frames = (file1, file2, file3)\n",
+ "df = pd.concat(frames, axis=0, ignore_index=True)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "1a0c14dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_data(df):\n",
+ "    \"\"\"Return a cleaned copy of ``df``; the caller's frame is left unmodified.\n",
+ "\n",
+ "    Column names are stripped, lower-cased and snake_cased, and ``st`` is renamed\n",
+ "    to ``state``. Columns that end up sharing a name are merged into one, taking the\n",
+ "    first non-null value per row. Duplicate rows and fully empty rows are dropped,\n",
+ "    ``date`` is parsed to datetime when present, and missing-value counts are printed.\n",
+ "    \"\"\"\n",
+ "    cleaned = df.copy()\n",
+ "    cleaned.columns = cleaned.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')\n",
+ "    cleaned = cleaned.rename(columns={'st': 'state'})\n",
+ "\n",
+ "    # Differently spelled headers can collide once normalised (the combined lab data\n",
+ "    # otherwise ends up with two 'gender' columns); keep a single column per name.\n",
+ "    if cleaned.columns.duplicated().any():\n",
+ "        merged = []\n",
+ "        for name in cleaned.columns.unique():\n",
+ "            same_name = cleaned.loc[:, cleaned.columns == name]\n",
+ "            column = same_name.iloc[:, 0]\n",
+ "            for pos in range(1, same_name.shape[1]):\n",
+ "                # Fill only the rows that are still missing from the next duplicate.\n",
+ "                column = column.where(column.notna(), same_name.iloc[:, pos])\n",
+ "            merged.append(column)\n",
+ "        cleaned = pd.concat(merged, axis=1)\n",
+ "\n",
+ "    cleaned = cleaned.drop_duplicates().dropna(how='all')\n",
+ "\n",
+ "    if 'date' in cleaned.columns:\n",
+ "        # assign() builds a new frame, so no chained-assignment warning is raised.\n",
+ "        cleaned = cleaned.assign(date=pd.to_datetime(cleaned['date'], errors='coerce'))\n",
+ "\n",
+ "    print(\"Missing values per column:\")\n",
+ "    print(cleaned.isnull().sum())\n",
+ "\n",
+ "    return cleaned\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "3ac200a2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Missing values per column:\n",
+ "customer 0\n",
+ "st 7070\n",
+ "gender 7192\n",
+ "education 0\n",
+ "customer_lifetime_value 7\n",
+ "income 0\n",
+ "monthly_premium_auto 0\n",
+ "number_of_open_complaints 0\n",
+ "policy_type 0\n",
+ "vehicle_class 0\n",
+ "total_claim_amount 0\n",
+ "state 2064\n",
+ "gender 2064\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run the cleaning routine on the combined frame (it also prints missing-value counts).\n",
+ "cleaned_df = clean_data(df=df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "6f6da246",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer | \n",
+ " st | \n",
+ " gender | \n",
+ " education | \n",
+ " customer_lifetime_value | \n",
+ " income | \n",
+ " monthly_premium_auto | \n",
+ " number_of_open_complaints | \n",
+ " policy_type | \n",
+ " vehicle_class | \n",
+ " total_claim_amount | \n",
+ " state | \n",
+ " gender | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " RB50392 | \n",
+ " Washington | \n",
+ " NaN | \n",
+ " Master | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 1000.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 2.704934 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " QZ44356 | \n",
+ " Arizona | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 697953.59% | \n",
+ " 0.0 | \n",
+ " 94.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 1131.464935 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " AI49188 | \n",
+ " Nevada | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 1288743.17% | \n",
+ " 48767.0 | \n",
+ " 108.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Two-Door Car | \n",
+ " 566.472247 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " WW63253 | \n",
+ " California | \n",
+ " M | \n",
+ " Bachelor | \n",
+ " 764586.18% | \n",
+ " 0.0 | \n",
+ " 106.0 | \n",
+ " 1/0/00 | \n",
+ " Corporate Auto | \n",
+ " SUV | \n",
+ " 529.881344 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " GA49547 | \n",
+ " Washington | \n",
+ " M | \n",
+ " High School or Below | \n",
+ " 536307.65% | \n",
+ " 36357.0 | \n",
+ " 68.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 17.269323 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer st gender education customer_lifetime_value \\\n",
+ "0 RB50392 Washington NaN Master NaN \n",
+ "1 QZ44356 Arizona F Bachelor 697953.59% \n",
+ "2 AI49188 Nevada F Bachelor 1288743.17% \n",
+ "3 WW63253 California M Bachelor 764586.18% \n",
+ "4 GA49547 Washington M High School or Below 536307.65% \n",
+ "\n",
+ " income monthly_premium_auto number_of_open_complaints policy_type \\\n",
+ "0 0.0 1000.0 1/0/00 Personal Auto \n",
+ "1 0.0 94.0 1/0/00 Personal Auto \n",
+ "2 48767.0 108.0 1/0/00 Personal Auto \n",
+ "3 0.0 106.0 1/0/00 Corporate Auto \n",
+ "4 36357.0 68.0 1/0/00 Personal Auto \n",
+ "\n",
+ " vehicle_class total_claim_amount state gender \n",
+ "0 Four-Door Car 2.704934 NaN NaN \n",
+ "1 Four-Door Car 1131.464935 NaN NaN \n",
+ "2 Two-Door Car 566.472247 NaN NaN \n",
+ "3 SUV 529.881344 NaN NaN \n",
+ "4 Four-Door Car 17.269323 NaN NaN "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first five cleaned rows.\n",
+ "cleaned_df.head(n=5)"
]
},
{
@@ -72,14 +300,15 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26",
"metadata": {
"id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26"
},
"outputs": [],
"source": [
- "# Your code goes here"
+ "# Load the marketing customer analysis dataset used by the pivot-table cells.\n",
+ "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
+ "df = pd.read_csv(filepath_or_buffer=url)"
]
},
{
@@ -93,6 +322,91 @@
"Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "3d92b96d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " total_claim_amount | \n",
+ "
\n",
+ " \n",
+ " sales_channel | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Agent | \n",
+ " 1810226.82 | \n",
+ "
\n",
+ " \n",
+ " Branch | \n",
+ " 1301204.00 | \n",
+ "
\n",
+ " \n",
+ " Call Center | \n",
+ " 926600.82 | \n",
+ "
\n",
+ " \n",
+ " Web | \n",
+ " 706600.04 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " total_claim_amount\n",
+ "sales_channel \n",
+ "Agent 1810226.82\n",
+ "Branch 1301204.00\n",
+ "Call Center 926600.82\n",
+ "Web 706600.04"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sum of total_claim_amount per sales channel, rounded to 2 decimals.\n",
+ "revenue_by_channel = df.pivot_table(index='sales_channel', values='total_claim_amount', aggfunc='sum').round(2)\n",
+ "\n",
+ "revenue_by_channel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "45d04264",
+ "metadata": {},
+ "source": [
+ "Agents brought in the highest revenue.\n",
+ "Web sales are the lowest, probably because of the time and information the process requires.\n"
+ ]
+ },
{
"cell_type": "markdown",
"id": "640993b2-a291-436c-a34d-a551144f8196",
@@ -103,6 +417,96 @@
"2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "50e659c9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " education | \n",
+ " Bachelor | \n",
+ " College | \n",
+ " Doctor | \n",
+ " High School or Below | \n",
+ " Master | \n",
+ "
\n",
+ " \n",
+ " gender | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " F | \n",
+ " 7874.27 | \n",
+ " 7748.82 | \n",
+ " 7328.51 | \n",
+ " 8675.22 | \n",
+ " 8157.05 | \n",
+ "
\n",
+ " \n",
+ " M | \n",
+ " 7703.60 | \n",
+ " 8052.46 | \n",
+ " 7415.33 | \n",
+ " 8149.69 | \n",
+ " 8168.83 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "education Bachelor College Doctor High School or Below Master\n",
+ "gender \n",
+ "F 7874.27 7748.82 7328.51 8675.22 8157.05\n",
+ "M 7703.60 8052.46 7415.33 8149.69 8168.83"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean customer lifetime value: one row per gender, one column per education level.\n",
+ "clv_by_gender_education = df.pivot_table(index='gender', columns='education', values='customer_lifetime_value', aggfunc='mean').round(2)\n",
+ "\n",
+ "clv_by_gender_education"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "258ecde6",
+ "metadata": {},
+ "source": [
+ "Female customers with a High School or Below education have the highest average CLV.\n",
+ "Higher education doesn't mean higher CLV."
+ ]
+ },
{
"cell_type": "markdown",
"id": "32c7f2e5-3d90-43e5-be33-9781b6069198",
@@ -130,14 +534,121 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "3a069e0b-b400-470e-904d-d17582191be4",
"metadata": {
"id": "3a069e0b-b400-470e-904d-d17582191be4"
},
"outputs": [],
"source": [
- "# Your code goes here"
+ "# Re-read the marketing dataset so this section starts from an unmodified frame.\n",
+ "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
+ "df = pd.read_csv(filepath_or_buffer=url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "2701079e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parse the date column once (unparseable values become NaT), then derive the month name from it.\n",
+ "effective_dates = pd.to_datetime(df['effective_to_date'], errors='coerce')\n",
+ "df['effective_to_date'] = effective_dates\n",
+ "df['month'] = effective_dates.dt.month_name()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "bcf6774d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add up the open complaints per policy type and month; .size() would only count customer rows.\n",
+ "complaints_summary = df.groupby(['policy_type', 'month'])['number_of_open_complaints'].sum().reset_index(name='number_of_complaints')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "701ba1f8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " policy_type | \n",
+ " month | \n",
+ " number_of_complaints | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Corporate Auto | \n",
+ " February | \n",
+ " 1089 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Corporate Auto | \n",
+ " January | \n",
+ " 1252 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Personal Auto | \n",
+ " February | \n",
+ " 3799 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Personal Auto | \n",
+ " January | \n",
+ " 4329 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Special Auto | \n",
+ " February | \n",
+ " 204 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " policy_type month number_of_complaints\n",
+ "0 Corporate Auto February 1089\n",
+ "1 Corporate Auto January 1252\n",
+ "2 Personal Auto February 3799\n",
+ "3 Personal Auto January 4329\n",
+ "4 Special Auto February 204"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the first five rows of the long-format summary.\n",
+ "complaints_summary.head(n=5)"
]
}
],
@@ -146,7 +657,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -160,7 +671,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.11.7"
}
},
"nbformat": 4,