diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb
index fbd46831..e253efac 100644
--- a/lab-dw-pandas.ipynb
+++ b/lab-dw-pandas.ipynb
@@ -82,12 +82,490 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(4008, 11)"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n",
+ "\n",
+ "df = pd.read_csv(url)\n",
+ "\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "38a37ac0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Customer | \n",
+ " ST | \n",
+ " GENDER | \n",
+ " Education | \n",
+ " Customer Lifetime Value | \n",
+ " Income | \n",
+ " Monthly Premium Auto | \n",
+ " Number of Open Complaints | \n",
+ " Policy Type | \n",
+ " Vehicle Class | \n",
+ " Total Claim Amount | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " RB50392 | \n",
+ " Washington | \n",
+ " NaN | \n",
+ " Master | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 1000.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 2.704934 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " QZ44356 | \n",
+ " Arizona | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 697953.59% | \n",
+ " 0.0 | \n",
+ " 94.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 1131.464935 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " AI49188 | \n",
+ " Nevada | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 1288743.17% | \n",
+ " 48767.0 | \n",
+ " 108.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Two-Door Car | \n",
+ " 566.472247 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " WW63253 | \n",
+ " California | \n",
+ " M | \n",
+ " Bachelor | \n",
+ " 764586.18% | \n",
+ " 0.0 | \n",
+ " 106.0 | \n",
+ " 1/0/00 | \n",
+ " Corporate Auto | \n",
+ " SUV | \n",
+ " 529.881344 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " GA49547 | \n",
+ " Washington | \n",
+ " M | \n",
+ " High School or Below | \n",
+ " 536307.65% | \n",
+ " 36357.0 | \n",
+ " 68.0 | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ " 17.269323 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Customer ST GENDER Education Customer Lifetime Value \\\n",
+ "0 RB50392 Washington NaN Master NaN \n",
+ "1 QZ44356 Arizona F Bachelor 697953.59% \n",
+ "2 AI49188 Nevada F Bachelor 1288743.17% \n",
+ "3 WW63253 California M Bachelor 764586.18% \n",
+ "4 GA49547 Washington M High School or Below 536307.65% \n",
+ "\n",
+ " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n",
+ "0 0.0 1000.0 1/0/00 Personal Auto \n",
+ "1 0.0 94.0 1/0/00 Personal Auto \n",
+ "2 48767.0 108.0 1/0/00 Personal Auto \n",
+ "3 0.0 106.0 1/0/00 Corporate Auto \n",
+ "4 36357.0 68.0 1/0/00 Personal Auto \n",
+ "\n",
+ " Vehicle Class Total Claim Amount \n",
+ "0 Four-Door Car 2.704934 \n",
+ "1 Four-Door Car 1131.464935 \n",
+ "2 Two-Door Car 566.472247 \n",
+ "3 SUV 529.881344 \n",
+ "4 Four-Door Car 17.269323 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e180114c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Customer object\n",
+ "ST object\n",
+ "GENDER object\n",
+ "Education object\n",
+ "Customer Lifetime Value object\n",
+ "Income float64\n",
+ "Monthly Premium Auto float64\n",
+ "Number of Open Complaints object\n",
+ "Policy Type object\n",
+ "Vehicle Class object\n",
+ "Total Claim Amount float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes\n",
+ "\n",
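+ "# No: Customer Lifetime Value and Number of Open Complaints are stored as object but should be numeric.\n",
+ "# Both need cleaning (e.g. values like \"697953.59%\" and \"1/0/00\") before they can be converted."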
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "f0a24467",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Customer 1071\n",
+ "ST 8\n",
+ "GENDER 5\n",
+ "Education 6\n",
+ "Customer Lifetime Value 1027\n",
+ "Income 774\n",
+ "Monthly Premium Auto 132\n",
+ "Number of Open Complaints 6\n",
+ "Policy Type 3\n",
+ "Vehicle Class 6\n",
+ "Total Claim Amount 761\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.nunique()\n",
+ "\n",
+ "# Categoricals: ST (state), GENDER, Education, Policy Type and Vehicle Class.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "47422330",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ST\n",
+ "Oregon 320\n",
+ "California 211\n",
+ "Arizona 186\n",
+ "Cali 120\n",
+ "Nevada 98\n",
+ "Washington 81\n",
+ "WA 30\n",
+ "AZ 25\n",
+ "Name: count, dtype: int64\n",
+ "GENDER\n",
+ "F 457\n",
+ "M 413\n",
+ "Male 39\n",
+ "female 28\n",
+ "Femal 17\n",
+ "Name: count, dtype: int64\n",
+ "Education\n",
+ "Bachelor 324\n",
+ "College 313\n",
+ "High School or Below 296\n",
+ "Master 94\n",
+ "Doctor 37\n",
+ "Bachelors 7\n",
+ "Name: count, dtype: int64\n",
+ "Policy Type\n",
+ "Personal Auto 780\n",
+ "Corporate Auto 234\n",
+ "Special Auto 57\n",
+ "Name: count, dtype: int64\n",
+ "Vehicle Class\n",
+ "Four-Door Car 576\n",
+ "Two-Door Car 205\n",
+ "SUV 199\n",
+ "Sports Car 57\n",
+ "Luxury SUV 20\n",
+ "Luxury Car 14\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Frequency table for each categorical column, printed one after another,\n",
+ "# to spot inconsistent labels (e.g. \"Cali\" vs \"California\", \"F\" vs \"female\").\n",
+ "categorical_cols = [\"ST\", \"GENDER\", \"Education\", \"Policy Type\", \"Vehicle Class\"]\n",
+ "\n",
+ "for col in categorical_cols:\n",
+ "    print(df[col].value_counts())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b1ae3b37",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.382107"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# A cell only displays its last expression, so separate .max()/.min()\n",
+ "# statements silently drop all but the final value. Aggregate once instead\n",
+ "# to show the min and max of every numeric column in a single table.\n",
+ "numeric_cols = [\"Income\", \"Monthly Premium Auto\", \"Total Claim Amount\"]\n",
+ "\n",
+ "df[numeric_cols].agg([\"min\", \"max\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "cea109d7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "count 1071.000000\n",
+ "mean 39295.701214\n",
+ "std 30469.427060\n",
+ "min 0.000000\n",
+ "25% 14072.000000\n",
+ "50% 36234.000000\n",
+ "75% 64631.000000\n",
+ "max 99960.000000\n",
+ "Name: Income, dtype: float64\n",
+ "count 1071.000000\n",
+ "mean 193.234360\n",
+ "std 1601.190369\n",
+ "min 61.000000\n",
+ "25% 68.000000\n",
+ "50% 83.000000\n",
+ "75% 109.500000\n",
+ "max 35354.000000\n",
+ "Name: Monthly Premium Auto, dtype: float64\n",
+ "count 1071.000000\n",
+ "mean 404.986909\n",
+ "std 293.027260\n",
+ "min 0.382107\n",
+ "25% 202.157702\n",
+ "50% 354.729129\n",
+ "75% 532.800000\n",
+ "max 2893.239678\n",
+ "Name: Total Claim Amount, dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Summary statistics for each numeric column, printed in the same order as before\n",
+ "for numeric_col in [\"Income\", \"Monthly Premium Auto\", \"Total Claim Amount\"]:\n",
+ "    print(df[numeric_col].describe())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "7d79cf29",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Customer | \n",
+ " ST | \n",
+ " GENDER | \n",
+ " Education | \n",
+ " Customer Lifetime Value | \n",
+ " Number of Open Complaints | \n",
+ " Policy Type | \n",
+ " Vehicle Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1071 | \n",
+ " 1071 | \n",
+ " 954 | \n",
+ " 1071 | \n",
+ " 1068 | \n",
+ " 1071 | \n",
+ " 1071 | \n",
+ " 1071 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 1071 | \n",
+ " 8 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 1027 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " MY31220 | \n",
+ " Oregon | \n",
+ " F | \n",
+ " Bachelor | \n",
+ " 251459.20% | \n",
+ " 1/0/00 | \n",
+ " Personal Auto | \n",
+ " Four-Door Car | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 1 | \n",
+ " 320 | \n",
+ " 457 | \n",
+ " 324 | \n",
+ " 4 | \n",
+ " 830 | \n",
+ " 780 | \n",
+ " 576 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Customer ST GENDER Education Customer Lifetime Value \\\n",
+ "count 1071 1071 954 1071 1068 \n",
+ "unique 1071 8 5 6 1027 \n",
+ "top MY31220 Oregon F Bachelor 251459.20% \n",
+ "freq 1 320 457 324 4 \n",
+ "\n",
+ " Number of Open Complaints Policy Type Vehicle Class \n",
+ "count 1071 1071 1071 \n",
+ "unique 6 3 6 \n",
+ "top 1/0/00 Personal Auto Four-Door Car \n",
+ "freq 830 780 576 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.describe(include=\"object\")"
]
},
{
@@ -116,12 +594,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"id": "2dca5073-4520-4f42-9390-4b92733284ed",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ST\n",
+ "AZ 25\n",
+ "WA 30\n",
+ "Washington 81\n",
+ "Nevada 98\n",
+ "Cali 120\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "df_location = df[\"ST\"]\n",
+ "\n",
+ "# Count customers per location, order from rarest to most common, keep the first five\n",
+ "location_counts = df_location.value_counts()\n",
+ "location_counts.sort_values().head(5)"
]
},
{
@@ -146,12 +643,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Personal Auto'"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "df_policy = df[\"Policy Type\"]\n",
+ "\n",
+ "# value_counts() is ordered by descending frequency, so its first label is the most sold policy\n",
+ "policy_counts = df_policy.value_counts()\n",
+ "policy_counts.index[0]"
]
},
{
@@ -176,12 +687,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"id": "0c0563cf-6f8b-463d-a321-651a972f82e5",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "38180.69871794872\n",
+ "41390.31196581197\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "\n",
+ "is_personal = df[\"Policy Type\"] == \"Personal Auto\"\n",
+ "is_corporate = df[\"Policy Type\"] == \"Corporate Auto\"\n",
+ "df_PA = df[is_personal]\n",
+ "df_CA = df[is_corporate]\n",
+ "\n",
+ "# Average income per policy type: Personal Auto first, then Corporate Auto\n",
+ "for policy_subset in (df_PA, df_CA):\n",
+ "    print(policy_subset[\"Income\"].mean())"
]
},
{
@@ -226,18 +752,53 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
"id": "b731bca6-a760-4860-a27b-a33efa712ce0",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1002.0\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "count 1002.000000\n",
+ "mean 431.165318\n",
+ "std 284.818254\n",
+ "min 48.517439\n",
+ "25% 260.201409\n",
+ "50% 375.996255\n",
+ "75% 542.031802\n",
+ "max 2893.239678\n",
+ "Name: Total Claim Amount, dtype: float64"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your code here"
+ "# Your code here\n",
+ "df_claim = df[\"Total Claim Amount\"]\n",
+ "\n",
+ "# Use the 75th percentile of the non-null claims as the threshold: the frame has\n",
+ "# 4008 rows but only 1071 hold data, so taking 4008*0.25 rows keeps almost all of them.\n",
+ "claim_p75 = df_claim.quantile(0.75)\n",
+ "print(claim_p75)\n",
+ "\n",
+ "claim_75 = df_claim[df_claim > claim_p75]\n",
+ "claim_75.describe()"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -251,7 +812,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.13.5"
}
},
"nbformat": 4,