From d594d3c5a5bc5ab58952aabb5ab68a6b5423525e Mon Sep 17 00:00:00 2001 From: SofiaPS-bio Date: Mon, 15 Sep 2025 21:25:23 +0200 Subject: [PATCH] Solved lab --- lab-dw-pandas.ipynb | 595 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 578 insertions(+), 17 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..e253efac0 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,490 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(4008, 11)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "\n", + "df = pd.read_csv(url)\n", + "\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "38a37ac0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e180114c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer object\n", + "ST object\n", + "GENDER object\n", + "Education object\n", + "Customer Lifetime Value object\n", + "Income float64\n", + "Monthly Premium Auto float64\n", + "Number of Open Complaints object\n", + "Policy Type object\n", + "Vehicle Class object\n", + "Total Claim Amount float64\n", + "dtype: object" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes\n", + "\n", + "#No, some should be int or float\n", + "#We need to clean the data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f0a24467", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.nunique()\n", + "\n", + "#Categporicals: ST (states), gender, Educataion, policy type and vehicle class.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "47422330", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ST\n", + "Oregon 320\n", + "California 211\n", + "Arizona 186\n", + "Cali 120\n", + "Nevada 98\n", + "Washington 81\n", + "WA 30\n", + "AZ 25\n", + "Name: count, dtype: int64\n", + "GENDER\n", + "F 457\n", + "M 413\n", + "Male 39\n", + "female 28\n", + "Femal 17\n", + "Name: count, dtype: int64\n", + "Education\n", + "Bachelor 324\n", + "College 313\n", + "High School or Below 296\n", + "Master 94\n", + "Doctor 37\n", + "Bachelors 7\n", + "Name: count, dtype: int64\n", + "Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n", + "Vehicle Class\n", + "Four-Door Car 576\n", + "Two-Door Car 205\n", + "SUV 199\n", + "Sports Car 57\n", + "Luxury SUV 20\n", + "Luxury Car 14\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#df[\"ST\"].value_counts()\n", + "#df[\"GENDER\"].value_counts()\n", + "#df[\"Education\"].value_counts()\n", + "#df[\"Policy Type\"].value_counts()\n", + "#df[\"Vehicle Class\"].value_counts()\n", + "\n", + "for col in [\"ST\",\"GENDER\",\"Education\",\"Policy Type\",\"Vehicle Class\"]:\n", + " print(df[col].value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b1ae3b37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.382107" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Income\"].max()\n", + "df[\"Income\"].min()\n", + "df[\"Monthly Premium Auto\"].max()\n", + "df[\"Monthly Premium Auto\"].min()\n", + "df[\"Total Claim Amount\"].max()\n", + "df[\"Total Claim Amount\"].min()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cea109d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 1071.000000\n", + "mean 39295.701214\n", + "std 30469.427060\n", + "min 0.000000\n", + "25% 14072.000000\n", + "50% 36234.000000\n", + "75% 64631.000000\n", + "max 99960.000000\n", + "Name: Income, dtype: float64\n", + "count 1071.000000\n", + "mean 193.234360\n", + "std 1601.190369\n", + "min 61.000000\n", + "25% 68.000000\n", + "50% 83.000000\n", + "75% 109.500000\n", + "max 35354.000000\n", + "Name: Monthly Premium Auto, dtype: float64\n", + "count 1071.000000\n", + "mean 404.986909\n", + "std 293.027260\n", + "min 0.382107\n", + "25% 202.157702\n", + "50% 354.729129\n", + "75% 532.800000\n", + "max 2893.239678\n", + "Name: Total Claim Amount, dtype: float64\n" + ] + } + ], + "source": [ + "print(df[\"Income\"].describe())\n", + "print(df[\"Monthly Premium Auto\"].describe())\n", + "print(df[\"Total Claim Amount\"].describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7d79cf29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueNumber of Open ComplaintsPolicy TypeVehicle Class
count1071107195410711068107110711071
unique10718561027636
topMY31220OregonFBachelor251459.20%1/0/00Personal AutoFour-Door Car
freq13204573244830780576
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "count 1071 1071 954 1071 1068 \n", + "unique 1071 8 5 6 1027 \n", + "top MY31220 Oregon F Bachelor 251459.20% \n", + "freq 1 320 457 324 4 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \n", + "count 1071 1071 1071 \n", + "unique 6 3 6 \n", + "top 1/0/00 Personal Auto Four-Door Car \n", + "freq 830 780 576 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(include=\"object\")" ] }, { @@ -116,12 +594,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "df_location = df.ST\n", + "df_location.value_counts().sort_values().head()" ] }, { @@ -146,12 +643,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Personal Auto'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "df_policy = df[\"Policy Type\"]\n", + "\n", + "df_policy.value_counts().idxmax()" ] }, { @@ -176,12 +687,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "38180.69871794872\n", + "41390.31196581197\n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "\n", + "df_PA = df[df[\"Policy Type\"] == \"Personal Auto\"]\n", + "df_CA = df[df[\"Policy Type\"] == \"Corporate Auto\"]\n", + "\n", + "print (df_PA[\"Income\"].mean())\n", + "print (df_CA[\"Income\"].mean())" ] }, { @@ -226,18 +752,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "b731bca6-a760-4860-a27b-a33efa712ce0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1002.0\n" + ] + }, + { + "data": { + "text/plain": [ + "count 1002.000000\n", + "mean 431.165318\n", + "std 284.818254\n", + "min 48.517439\n", + "25% 260.201409\n", + "50% 375.996255\n", + "75% 542.031802\n", + "max 2893.239678\n", + "Name: Total Claim Amount, dtype: float64" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "df_claim = df[\"Total Claim Amount\"]\n", + "df_claim.describe()\n", + "\n", + "n75 = 4008*0.25\n", + "print(n75)\n", + "\n", + "claim_75 = df_claim.sort_values(ascending=False).head(1002)\n", + "\n", + "claim_75.describe()" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -251,7 +812,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,