From d594d3c5a5bc5ab58952aabb5ab68a6b5423525e Mon Sep 17 00:00:00 2001
From: SofiaPS-bio <sofia.scomazzon@gmail.com>
Date: Mon, 15 Sep 2025 21:25:23 +0200
Subject: [PATCH] Solved lab

---
 lab-dw-pandas.ipynb | 595 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 578 insertions(+), 17 deletions(-)
diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb
index fbd468314..e253efac0 100644
--- a/lab-dw-pandas.ipynb
+++ b/lab-dw-pandas.ipynb
@@ -82,12 +82,490 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4008, 11)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Your code here"
+    "# Your code here\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n",
+    "\n",
+    "df = pd.read_csv(url)\n",
+    "\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "38a37ac0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Customer</th>\n",
+       "      <th>ST</th>\n",
+       "      <th>GENDER</th>\n",
+       "      <th>Education</th>\n",
+       "      <th>Customer Lifetime Value</th>\n",
+       "      <th>Income</th>\n",
+       "      <th>Monthly Premium Auto</th>\n",
+       "      <th>Number of Open Complaints</th>\n",
+       "      <th>Policy Type</th>\n",
+       "      <th>Vehicle Class</th>\n",
+       "      <th>Total Claim Amount</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>RB50392</td>\n",
+       "      <td>Washington</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Master</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1000.0</td>\n",
+       "      <td>1/0/00</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>2.704934</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>QZ44356</td>\n",
+       "      <td>Arizona</td>\n",
+       "      <td>F</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>697953.59%</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>94.0</td>\n",
+       "      <td>1/0/00</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>1131.464935</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>AI49188</td>\n",
+       "      <td>Nevada</td>\n",
+       "      <td>F</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>1288743.17%</td>\n",
+       "      <td>48767.0</td>\n",
+       "      <td>108.0</td>\n",
+       "      <td>1/0/00</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Two-Door Car</td>\n",
+       "      <td>566.472247</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>WW63253</td>\n",
+       "      <td>California</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>764586.18%</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>106.0</td>\n",
+       "      <td>1/0/00</td>\n",
+       "      <td>Corporate Auto</td>\n",
+       "      <td>SUV</td>\n",
+       "      <td>529.881344</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GA49547</td>\n",
+       "      <td>Washington</td>\n",
+       "      <td>M</td>\n",
+       "      <td>High School or Below</td>\n",
+       "      <td>536307.65%</td>\n",
+       "      <td>36357.0</td>\n",
+       "      <td>68.0</td>\n",
+       "      <td>1/0/00</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "      <td>17.269323</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  Customer          ST GENDER             Education Customer Lifetime Value  \\\n",
+       "0  RB50392  Washington    NaN                Master                     NaN   \n",
+       "1  QZ44356     Arizona      F              Bachelor              697953.59%   \n",
+       "2  AI49188      Nevada      F              Bachelor             1288743.17%   \n",
+       "3  WW63253  California      M              Bachelor              764586.18%   \n",
+       "4  GA49547  Washington      M  High School or Below              536307.65%   \n",
+       "\n",
+       "    Income  Monthly Premium Auto Number of Open Complaints     Policy Type  \\\n",
+       "0      0.0                1000.0                    1/0/00   Personal Auto   \n",
+       "1      0.0                  94.0                    1/0/00   Personal Auto   \n",
+       "2  48767.0                 108.0                    1/0/00   Personal Auto   \n",
+       "3      0.0                 106.0                    1/0/00  Corporate Auto   \n",
+       "4  36357.0                  68.0                    1/0/00   Personal Auto   \n",
+       "\n",
+       "   Vehicle Class  Total Claim Amount  \n",
+       "0  Four-Door Car            2.704934  \n",
+       "1  Four-Door Car         1131.464935  \n",
+       "2   Two-Door Car          566.472247  \n",
+       "3            SUV          529.881344  \n",
+       "4  Four-Door Car           17.269323  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e180114c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Customer                      object\n",
+       "ST                            object\n",
+       "GENDER                        object\n",
+       "Education                     object\n",
+       "Customer Lifetime Value       object\n",
+       "Income                       float64\n",
+       "Monthly Premium Auto         float64\n",
+       "Number of Open Complaints     object\n",
+       "Policy Type                   object\n",
+       "Vehicle Class                 object\n",
+       "Total Claim Amount           float64\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.dtypes\n",
+    "\n",
+    "#No, some should be int or float\n",
+    "#We need to clean the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f0a24467",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Customer                     1071\n",
+       "ST                              8\n",
+       "GENDER                          5\n",
+       "Education                       6\n",
+       "Customer Lifetime Value      1027\n",
+       "Income                        774\n",
+       "Monthly Premium Auto          132\n",
+       "Number of Open Complaints       6\n",
+       "Policy Type                     3\n",
+       "Vehicle Class                   6\n",
+       "Total Claim Amount            761\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.nunique()\n",
+    "\n",
+    "#Categporicals: ST (states), gender, Educataion, policy type and vehicle class.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "47422330",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ST\n",
+      "Oregon        320\n",
+      "California    211\n",
+      "Arizona       186\n",
+      "Cali          120\n",
+      "Nevada         98\n",
+      "Washington     81\n",
+      "WA             30\n",
+      "AZ             25\n",
+      "Name: count, dtype: int64\n",
+      "GENDER\n",
+      "F         457\n",
+      "M         413\n",
+      "Male       39\n",
+      "female     28\n",
+      "Femal      17\n",
+      "Name: count, dtype: int64\n",
+      "Education\n",
+      "Bachelor                324\n",
+      "College                 313\n",
+      "High School or Below    296\n",
+      "Master                   94\n",
+      "Doctor                   37\n",
+      "Bachelors                 7\n",
+      "Name: count, dtype: int64\n",
+      "Policy Type\n",
+      "Personal Auto     780\n",
+      "Corporate Auto    234\n",
+      "Special Auto       57\n",
+      "Name: count, dtype: int64\n",
+      "Vehicle Class\n",
+      "Four-Door Car    576\n",
+      "Two-Door Car     205\n",
+      "SUV              199\n",
+      "Sports Car        57\n",
+      "Luxury SUV        20\n",
+      "Luxury Car        14\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "#df[\"ST\"].value_counts()\n",
+    "#df[\"GENDER\"].value_counts()\n",
+    "#df[\"Education\"].value_counts()\n",
+    "#df[\"Policy Type\"].value_counts()\n",
+    "#df[\"Vehicle Class\"].value_counts()\n",
+    "\n",
+    "for col in [\"ST\",\"GENDER\",\"Education\",\"Policy Type\",\"Vehicle Class\"]:\n",
+    "    print(df[col].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b1ae3b37",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.382107"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Range of every numeric column in a single table. A cell displays only its last\n",
+    "# expression, so separate .max()/.min() statements would show just one value.\n",
+    "numeric_cols = [\"Income\", \"Monthly Premium Auto\", \"Total Claim Amount\"]\n",
+    "\n",
+    "# Rows are the statistics (min, max); columns are the numeric features\n",
+    "df[numeric_cols].agg([\"min\", \"max\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "cea109d7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "count     1071.000000\n",
+      "mean     39295.701214\n",
+      "std      30469.427060\n",
+      "min          0.000000\n",
+      "25%      14072.000000\n",
+      "50%      36234.000000\n",
+      "75%      64631.000000\n",
+      "max      99960.000000\n",
+      "Name: Income, dtype: float64\n",
+      "count     1071.000000\n",
+      "mean       193.234360\n",
+      "std       1601.190369\n",
+      "min         61.000000\n",
+      "25%         68.000000\n",
+      "50%         83.000000\n",
+      "75%        109.500000\n",
+      "max      35354.000000\n",
+      "Name: Monthly Premium Auto, dtype: float64\n",
+      "count    1071.000000\n",
+      "mean      404.986909\n",
+      "std       293.027260\n",
+      "min         0.382107\n",
+      "25%       202.157702\n",
+      "50%       354.729129\n",
+      "75%       532.800000\n",
+      "max      2893.239678\n",
+      "Name: Total Claim Amount, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the summary statistics of each numeric column, one after another\n",
+    "for numeric_col in (\"Income\", \"Monthly Premium Auto\", \"Total Claim Amount\"):\n",
+    "    print(df[numeric_col].describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7d79cf29",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Customer</th>\n",
+       "      <th>ST</th>\n",
+       "      <th>GENDER</th>\n",
+       "      <th>Education</th>\n",
+       "      <th>Customer Lifetime Value</th>\n",
+       "      <th>Number of Open Complaints</th>\n",
+       "      <th>Policy Type</th>\n",
+       "      <th>Vehicle Class</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>1071</td>\n",
+       "      <td>1071</td>\n",
+       "      <td>954</td>\n",
+       "      <td>1071</td>\n",
+       "      <td>1068</td>\n",
+       "      <td>1071</td>\n",
+       "      <td>1071</td>\n",
+       "      <td>1071</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>1071</td>\n",
+       "      <td>8</td>\n",
+       "      <td>5</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1027</td>\n",
+       "      <td>6</td>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>MY31220</td>\n",
+       "      <td>Oregon</td>\n",
+       "      <td>F</td>\n",
+       "      <td>Bachelor</td>\n",
+       "      <td>251459.20%</td>\n",
+       "      <td>1/0/00</td>\n",
+       "      <td>Personal Auto</td>\n",
+       "      <td>Four-Door Car</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>1</td>\n",
+       "      <td>320</td>\n",
+       "      <td>457</td>\n",
+       "      <td>324</td>\n",
+       "      <td>4</td>\n",
+       "      <td>830</td>\n",
+       "      <td>780</td>\n",
+       "      <td>576</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       Customer      ST GENDER Education Customer Lifetime Value  \\\n",
+       "count      1071    1071    954      1071                    1068   \n",
+       "unique     1071       8      5         6                    1027   \n",
+       "top     MY31220  Oregon      F  Bachelor              251459.20%   \n",
+       "freq          1     320    457       324                       4   \n",
+       "\n",
+       "       Number of Open Complaints    Policy Type  Vehicle Class  \n",
+       "count                       1071           1071           1071  \n",
+       "unique                         6              3              6  \n",
+       "top                       1/0/00  Personal Auto  Four-Door Car  \n",
+       "freq                         830            780            576  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.describe(include=\"object\")"
    ]
   },
   {
@@ -116,12 +594,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "id": "2dca5073-4520-4f42-9390-4b92733284ed",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ST\n",
+       "AZ             25\n",
+       "WA             30\n",
+       "Washington     81\n",
+       "Nevada         98\n",
+       "Cali          120\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Your code here"
+    "# Five least frequent locations, from the rarest upwards\n",
+    "df_location = df[\"ST\"]\n",
+    "df_location.value_counts().sort_values().head(5)"
    ]
   },
   {
@@ -146,12 +643,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
    "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Personal Auto'"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Your code here"
+    "# Policy type sold most often: the label with the highest count\n",
+    "df_policy = df.loc[:, \"Policy Type\"]\n",
+    "policy_counts = df_policy.value_counts()\n",
+    "policy_counts.idxmax()"
    ]
   },
   {
@@ -176,12 +687,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
    "id": "0c0563cf-6f8b-463d-a321-651a972f82e5",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "38180.69871794872\n",
+      "41390.31196581197\n"
+     ]
+    }
+   ],
    "source": [
-    "# Your code here"
+    "# Mean income of Personal Auto customers, then of Corporate Auto customers\n",
+    "is_personal = df[\"Policy Type\"] == \"Personal Auto\"\n",
+    "is_corporate = df[\"Policy Type\"] == \"Corporate Auto\"\n",
+    "df_PA = df.loc[is_personal]\n",
+    "df_CA = df.loc[is_corporate]\n",
+    "\n",
+    "print(df_PA[\"Income\"].mean(), df_CA[\"Income\"].mean(), sep=\"\\n\")"
    ]
   },
   {
@@ -226,18 +752,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 38,
    "id": "b731bca6-a760-4860-a27b-a33efa712ce0",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1002.0\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "count    1002.000000\n",
+       "mean      431.165318\n",
+       "std       284.818254\n",
+       "min        48.517439\n",
+       "25%       260.201409\n",
+       "50%       375.996255\n",
+       "75%       542.031802\n",
+       "max      2893.239678\n",
+       "Name: Total Claim Amount, dtype: float64"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Your code here"
+    "# Top 25% of claims: keep the values above the 75th percentile. quantile() skips the\n",
+    "# missing values, whereas a fixed row count (4008 * 0.25) also counted the empty rows.\n",
+    "df_claim = df[\"Total Claim Amount\"]\n",
+    "\n",
+    "claim_p75 = df_claim.quantile(0.75)\n",
+    "print(claim_p75)\n",
+    "\n",
+    "claim_75 = df_claim[df_claim > claim_p75].sort_values(ascending=False)\n",
+    "\n",
+    "claim_75.describe()"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -251,7 +812,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,

	Customer	ST	GENDER	Education	Customer Lifetime Value	Income	Monthly Premium Auto	Number of Open Complaints	Policy Type	Vehicle Class	Total Claim Amount
0	RB50392	Washington	NaN	Master	NaN	0.0	1000.0	1/0/00	Personal Auto	Four-Door Car	2.704934
1	QZ44356	Arizona	F	Bachelor	697953.59%	0.0	94.0	1/0/00	Personal Auto	Four-Door Car	1131.464935
2	AI49188	Nevada	F	Bachelor	1288743.17%	48767.0	108.0	1/0/00	Personal Auto	Two-Door Car	566.472247
3	WW63253	California	M	Bachelor	764586.18%	0.0	106.0	1/0/00	Corporate Auto	SUV	529.881344
4	GA49547	Washington	M	High School or Below	536307.65%	36357.0	68.0	1/0/00	Personal Auto	Four-Door Car	17.269323
	Customer	ST	GENDER	Education	Customer Lifetime Value	Number of Open Complaints	Policy Type	Vehicle Class
count	1071	1071	954	1071	1068	1071	1071	1071
unique	1071	8	5	6	1027	6	3	6
top	MY31220	Oregon	F	Bachelor	251459.20%	1/0/00	Personal Auto	Four-Door Car
freq	1	320	457	324	4	830	780	576