From 5b4c1e00de76df4607d419f79501fcd56f07a2fa Mon Sep 17 00:00:00 2001 From: Miguel Florindo Date: Sat, 13 Sep 2025 16:12:55 +0100 Subject: [PATCH] 'PandasFinishedLab' --- lab-dw-pandas.ipynb | 604 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 587 insertions(+), 17 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..2a6075b66 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,455 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 156, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataSetDimensions: (4008, 11)\n", + "\n", + "RangeIndex: 4008 entries, 0 to 4007\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Customer 1071 non-null object \n", + " 1 ST 1071 non-null object \n", + " 2 GENDER 954 non-null object \n", + " 3 Education 1071 non-null object \n", + " 4 Customer Lifetime Value 1068 non-null object \n", + " 5 Income 1071 non-null float64\n", + " 6 Monthly Premium Auto 1071 non-null float64\n", + " 7 Number of Open Complaints 1071 non-null object \n", + " 8 Policy Type 1071 non-null object \n", + " 9 Vehicle Class 1071 non-null object \n", + " 10 Total Claim Amount 1071 non-null float64\n", + "dtypes: float64(3), object(8)\n", + "memory usage: 344.6+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "import pandas as pd\n", + "\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "\n", + "\n", + "df = pd.read_csv(url)\n", + "\n", + "\n", + "print(f\"DataSetDimensions: {df.shape}\")\n", + "\n", + "df.info()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "8992496a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59 0.0 94.0 \n", + "2 1288743.17 48767.0 108.0 \n", + "3 764586.18 0.0 106.0 \n", + "4 536307.65 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]\n", + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington 0 Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 0 0 0 0 \n", + "4004 0 0 0 0 \n", + "4005 0 0 0 0 \n", + "4006 0 0 0 0 \n", + "4007 0 0 0 0 \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 0 0.0 1000.0 \n", + "1 697953.59 0.0 94.0 \n", + "2 1288743.17 48767.0 108.0 \n", + "3 764586.18 0.0 106.0 \n", + "4 536307.65 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 0 0.0 0.0 \n", + "4004 0 0.0 0.0 \n", + "4005 0 0.0 0.0 \n", + "4006 0 0.0 0.0 \n", + "4007 0 0.0 0.0 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 0 0 0 \n", + "4004 0 0 0 \n", + "4005 0 0 0 \n", + "4006 0 0 0 \n", + "4007 0 0 0 \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 0.000000 \n", + "4004 0.000000 \n", + "4005 0.000000 \n", + "4006 0.000000 \n", + "4007 0.000000 \n", + "\n", + "[4008 rows x 11 columns]\n" + ] + } + ], + "source": [ + "\n", + "df[\"Customer Lifetime Value\"] = df[\"Customer Lifetime Value\"].str.replace('%', '', regex=False)\n", + "\n", + "df[\"GENDER\"] = df[\"GENDER\"].str.replace(' ', '', regex=False)\n", + "print(df)\n", + "\n", + "df_clean = df.fillna(0)\n", + "print(df_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "ee780b5e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "unique_counts = df.nunique()\n", + "print(unique_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "9c8a316f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 'F' 'M']\n", + "\n", + "--- ST ---\n", + "ST\n", + "Oregon 320\n", + "California 211\n", + "Arizona 186\n", + "Cali 120\n", + "Nevada 98\n", + "Washington 81\n", + "WA 30\n", + "AZ 25\n", + "Name: count, dtype: int64\n", + "\n", + "--- GENDER ---\n", + "GENDER\n", + "F 502\n", + "M 452\n", + "Name: count, dtype: int64\n", + "\n", + "--- Education ---\n", + "Education\n", + "Bachelor 324\n", + "College 313\n", + "High School or Below 296\n", + "Master 94\n", + "Doctor 37\n", + "Bachelors 7\n", + "Name: count, dtype: int64\n", + "\n", + "--- Policy Type ---\n", + "Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n", + "\n", + "--- Vehicle Class ---\n", + "Vehicle Class\n", + "Four-Door Car 576\n", + "Two-Door Car 205\n", + "SUV 199\n", + "Sports Car 57\n", + "Luxury SUV 20\n", + "Luxury Car 14\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "\n", + "\n", + "gender_mapping = {\n", + " 'F': 'F', 'Femal': 'F', 'female': 'F', 'Male': 'M', 'M': 'M',\n", + "}\n", + "\n", + "df['GENDER'] = df['GENDER'].replace(gender_mapping)\n", + "\n", + "print(df['GENDER'].unique())\n", + "\n", + "categorical_cols = ['ST', 'GENDER', 'Education', 'Policy Type', 'Vehicle Class']\n", + "for col in categorical_cols:\n", + " print(f\"\\n--- {col} ---\")\n", + " print(df[col].value_counts().head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "id": "6fee1ea8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Income Monthly Premium Auto Total Claim Amount\n", + "count 1071.000000 1071.000000 1071.000000\n", + "mean 39295.701214 193.234360 404.986909\n", + "std 30469.427060 1601.190369 293.027260\n", + "min 0.000000 61.000000 0.382107\n", + "25% 14072.000000 68.000000 202.157702\n", + "50% 36234.000000 83.000000 354.729129\n", + "75% 64631.000000 109.500000 532.800000\n", + "max 99960.000000 35354.000000 2893.239678\n", + "mode 0.000000 65.000000 321.600000\n" + ] + } + ], + "source": [ + "numerical_cols = ['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Number of Open Complaints', 'Total Claim Amount']\n", + "summary = df[numerical_cols].describe()\n", + "\n", + "\n", + "mode_vals = df[numerical_cols].mode().iloc[0]\n", + "summary.loc['mode'] = mode_vals\n", + "\n", + "print(summary)" ] }, { @@ -116,12 +559,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 161, + "id": "243dfc12", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "State frequencies (descending):\n", + "ST\n", + "Oregon 320\n", + "California 211\n", + "Arizona 186\n", + "Cali 120\n", + "Nevada 98\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "most_frequent = df['ST'].value_counts().head(5)\n", + "\n", + "print(\"State frequencies (descending):\")\n", + "print(most_frequent)" + ] + }, + { + "cell_type": "code", + "execution_count": 162, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "State frequencies (ascending):\n", + "ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64\n" + ] + } + ], "source": [ - "# Your code here" + "least_frequent = df['ST'].value_counts(ascending=True).head(5)\n", + "print(\"\\nState frequencies (ascending):\")\n", + "print(least_frequent)" ] }, { @@ -146,12 +635,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 163, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of policies sold by type:\n", + "Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n", + "\n", + "Policy type with the highest sales: 'Personal Auto'\n" + ] + } + ], "source": [ - "# Your code here" + "policy_counts = df['Policy Type'].value_counts()\n", + "\n", + "print(\"Number of policies sold by type:\")\n", + "print(policy_counts)\n", + "\n", + "top_policy_type = policy_counts.idxmax()\n", + "\n", + "print(f\"\\nPolicy type with the highest sales: '{top_policy_type}'\")" ] }, { @@ -176,12 +687,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 164, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Income by Policy Type:\n", + "Personal Auto: $38,180.70\n", + "Corporate Auto: $41,390.31\n" + ] + } + ], "source": [ - "# Your code here" + "personal_auto_df = df.loc[df['Policy Type'] == 'Personal Auto']\n", + "\n", + "corporate_auto_df = df.loc[df['Policy Type'] == 'Corporate Auto']\n", + "\n", + "avg_income_personal = personal_auto_df['Income'].mean()\n", + "avg_income_corporate = corporate_auto_df['Income'].mean()\n", + "\n", + "print(\"Average Income by Policy Type:\")\n", + "print(f\"Personal Auto: ${avg_income_personal:,.2f}\")\n", + "print(f\"Corporate Auto: ${avg_income_corporate:,.2f}\")" ] }, { @@ -226,18 +756,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 165, "id": "b731bca6-a760-4860-a27b-a33efa712ce0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Income Monthly Premium Auto Total Claim Amount\n", + "count 264.000000 264.000000 264.000000\n", + "mean 23677.344697 165.193182 782.228263\n", + "std 27013.483721 623.930992 292.751640\n", + "min 0.000000 63.000000 537.600000\n", + "25% 0.000000 99.000000 606.521741\n", + "50% 18807.000000 114.000000 679.597985\n", + "75% 42423.750000 133.250000 851.400000\n", + "max 99316.000000 10202.000000 2893.239678\n", + " Total Claim Amount Income Monthly Premium Auto\n", + "count 264.000000 264.000000 264.000000\n", + "mean 782.228263 23677.344697 165.193182\n", + "std 292.751640 27013.483721 623.930992\n", + "min 537.600000 0.000000 63.000000\n", + "25% 606.521741 0.000000 99.000000\n", + "50% 679.597985 18807.000000 114.000000\n", + "75% 851.400000 42423.750000 133.250000\n", + "max 2893.239678 99316.000000 10202.000000\n" + ] + } + ], "source": [ - "# Your code here" + "claim_75th = df['Total Claim Amount'].quantile(0.75)\n", + "\n", + "high_claim_df = df.loc[df['Total Claim Amount'] > claim_75th]\n", + "\n", + "key_columns = ['Total Claim Amount', 'Income', 'Customer Lifetime Value', 'Monthly Premium Auto']\n", + "\n", + "print(high_claim_df.describe())\n", + "print(high_claim_df[key_columns].describe())" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d6166e3", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -251,7 +821,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,