From 622785cea6c32844c8a628cf263545102cfb5512 Mon Sep 17 00:00:00 2001 From: martin-paz-y Date: Sun, 14 Sep 2025 12:20:32 +0200 Subject: [PATCH] Solved Lab --- lab-dw-pandas.ipynb | 619 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 607 insertions(+), 12 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..3c8cb1b05 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,483 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, + "id": "3a795497-28e5-41d5-a6c6-e3c51324956a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 46, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "labdata_df = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "e49190cf-affb-4561-9a70-207823788fa9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labdata_df # Identify the dimensions of the dataset by determining the number of rows and columns it contains." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "708298fc-c738-438c-a38f-b445dc2dc35e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 4008 entries, 0 to 4007\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Customer 1071 non-null object \n", + " 1 ST 1071 non-null object \n", + " 2 GENDER 954 non-null object \n", + " 3 Education 1071 non-null object \n", + " 4 Customer Lifetime Value 1068 non-null object \n", + " 5 Income 1071 non-null float64\n", + " 6 Monthly Premium Auto 1071 non-null float64\n", + " 7 Number of Open Complaints 1071 non-null object \n", + " 8 Policy Type 1071 non-null object \n", + " 9 Vehicle Class 1071 non-null object \n", + " 10 Total Claim Amount 1071 non-null float64\n", + "dtypes: float64(3), object(8)\n", + "memory usage: 344.6+ KB\n" + ] + } + ], + "source": [ + "# Determine the data types of each column and evaluate whether they are appropriate for the nature of the variable. You should also provide suggestions for fixing any incorrect data types.\n", + "\n", + "labdata_df .info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4d039387-1214-497f-8b90-06d0869b324d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#- Identify the number of unique values for each column and determine which columns appear to be categorical. 
You should also describe the unique values of each categorical column and the range of values for numerical columns, and give your insights.\n", + "\n", + "labdata_df.nunique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "fed7d569-5971-4e17-9b0c-bd41d8cc63ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Median\n", + " Income 36234.000000\n", + "Monthly Premium Auto 83.000000\n", + "Total Claim Amount 354.729129\n", + "dtype: float64\n", + "This is Mean\n", + " Income 39295.701214\n", + "Monthly Premium Auto 193.234360\n", + "Total Claim Amount 404.986909\n", + "dtype: float64\n", + "This is Mode\n", + " Income Monthly Premium Auto Total Claim Amount\n", + "0 0.0 65.0 321.6\n", + "This is Std\n", + " Income 30469.427060\n", + "Monthly Premium Auto 1601.190369\n", + "Total Claim Amount 293.027260\n", + "dtype: float64\n", + "This is Q1 (25%)\n", + " Income 14072.000000\n", + "Monthly Premium Auto 68.000000\n", + "Total Claim Amount 202.157702\n", + "Name: 0.25, dtype: float64\n", + "This is Q2 (50% / Median)\n", + " Income 36234.000000\n", + "Monthly Premium Auto 83.000000\n", + "Total Claim Amount 354.729129\n", + "Name: 0.5, dtype: float64\n", + "This is Q3 (75%)\n", + " Income 64631.0\n", + "Monthly Premium Auto 109.5\n", + "Total Claim Amount 532.8\n", + "Name: 0.75, dtype: float64\n" + ] + } + ], + "source": [ + "#- Compute summary statistics such as mean, median, mode, standard deviation, and quartiles to understand the central tendency and distribution of the data for numerical columns. 
You should also provide your conclusions based on these summary statistics.\n", + "\n", + "# Selecciona solo las columnas numéricas\n", + "numeric_columns = labdata_df.select_dtypes(include='number')\n", + "\n", + "# Calcula las estadísticas\n", + "median = numeric_columns.median()\n", + "mean = numeric_columns.mean()\n", + "mode = numeric_columns.mode()\n", + "std = numeric_columns.std()\n", + "q1 = numeric_columns.quantile(0.25) # 25%\n", + "q2 = numeric_columns.quantile(0.50) # 50% = median\n", + "q3 = numeric_columns.quantile(0.75) # 75%\n", + "\n", + "# Muestra resultados\n", + "print(\"This is Median\\n\", median)\n", + "print(\"This is Mean\\n\", mean)\n", + "print(\"This is Mode\\n\", mode) # <-- aquí estaba mal en tu print\n", + "print(\"This is Std\\n\", std) # <-- aquí también estaba mal el texto\n", + "print(\"This is Q1 (25%)\\n\", q1)\n", + "print(\"This is Q2 (50% / Median)\\n\", q2)\n", + "print(\"This is Q3 (75%)\\n\", q3)\n", + "\n", + "# Insights: Income shows high variability (large std); Monthly Premium Auto has a large gap between mean and median (suggesting big outliers)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "1c7ce63f-4aa3-4d2c-b0a7-718f384318b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Count\n", + " Customer 1071\n", + "ST 1071\n", + "GENDER 954\n", + "Education 1071\n", + "Customer Lifetime Value 1068\n", + "Number of Open Complaints 1071\n", + "Policy Type 1071\n", + "Vehicle Class 1071\n", + "dtype: int64\n", + "This is Unique\n", + " Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "#- Compute summary statistics for categorical columns and provide your conclusions based on these statistics.\n", + "\n", + "# Select only the categorical columns (object / int64 dtypes)\n", + "object_int_columns = 
labdata_df.select_dtypes(include=[\"object\", \"int64\"])\n", + "\n", + "# Calcula las estadísticas\n", + "count = object_int_columns.count()\n", + "nunique = object_int_columns.nunique()\n", + "\n", + "# Muestra resultados\n", + "print(\"This is Count\\n\", count)\n", + "print(\"This is Unique\\n\", nunique)\n", + "\n", + "#Insights: Gender has missing values (~11%), Customer Lifetime Value is considered an object because of the % sign, and Policy Type, State, and Education have few categories useful for segmentation.\n", + "\n" ] }, { @@ -116,12 +587,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "\n", + "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "labdata_df_state = pd.read_csv(url, usecols=[\"ST\"])\n", + "\n", + "labdata_df_state[\"ST\"].value_counts(ascending=True)[0:5] \n" ] }, { @@ -146,12 +638,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RangeIndex(start=0, stop=4008, step=1)\n", + "Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n" + ] + } + ], "source": [ - "# Your code here" + "\n", + "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "labdata_df_policy = pd.read_csv(url, usecols=[\"Policy Type\"])\n", + "\n", + "print(labdata_df_policy.index)\n", + "\n", + "\n", + "print(labdata_df_policy[\"Policy 
Type\"].value_counts()) #ojo noolvidarse de los ()\n", + "\n", + "\n" ] }, { @@ -176,12 +690,93 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Policy Type\n", + "Corporate Auto 41390.311966\n", + "Personal Auto 38180.698718\n", + "Special Auto 45954.701754\n", + "Name: Income, dtype: float64" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "\n", + "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "labdata_df_policy_income = pd.read_csv(url, usecols=[\"Policy Type\",\"Income\"])\n", + "\n", + "#Use loc to create two dataframes: one containing only Personal Auto policies and one containing only Corporate Auto policies.\n", + "\n", + "\n", + "\n", + "avg_income_per_policy = labdata_df_policy_income.groupby(\"Policy Type\")[\"Income\"].mean()\n", + "\n", + "avg_income_per_policy\n" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "2b6d7061-b47c-431b-a040-05407ec2ddd6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Personal Auto policies:\n", + " Income Policy Type\n", + "0 0.0 Personal Auto\n", + "1 0.0 Personal Auto\n", + "2 48767.0 Personal Auto\n", + "4 36357.0 Personal Auto\n", + "5 62902.0 Personal Auto \n", + "\n", + "Corporate Auto policies:\n", + " Income Policy Type\n", + "3 0.0 Corporate Auto\n", + "6 55350.0 Corporate Auto\n", + "7 0.0 Corporate Auto\n", + "8 14072.0 Corporate Auto\n", + "12 77026.0 Corporate Auto \n", + "\n", + "Average income (Personal Auto): 38180.69871794872\n", + "Average income (Corporate Auto): 41390.31196581197\n" + ] + } + ], + "source": [ + "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "labdata_df_policy_income = pd.read_csv(url, 
usecols=[\"Policy Type\",\"Income\"])\n", + "\n", + "# Usa el nombre correcto del DataFrame\n", + "personal_auto_df = labdata_df_policy_income.loc[\n", + " labdata_df_policy_income[\"Policy Type\"] == \"Personal Auto\"\n", + "]\n", + "\n", + "corporate_auto_df = labdata_df_policy_income.loc[\n", + " labdata_df_policy_income[\"Policy Type\"] == \"Corporate Auto\"\n", + "]\n", + "\n", + "print(\"Personal Auto policies:\\n\", personal_auto_df.head(), \"\\n\")\n", + "print(\"Corporate Auto policies:\\n\", corporate_auto_df.head(), \"\\n\")\n", + "\n", + "# 👉 Cálculo de los promedios\n", + "avg_income_personal = personal_auto_df[\"Income\"].mean()\n", + "avg_income_corporate = corporate_auto_df[\"Income\"].mean()\n", + "\n", + "print(\"Average income (Personal Auto):\", avg_income_personal)\n", + "print(\"Average income (Corporate Auto):\", avg_income_corporate)\n", + "\n" ] }, { @@ -251,7 +846,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.7" } }, "nbformat": 4,