From fc0bf697a2ed82d36d47eb97b43ca267a15e88be Mon Sep 17 00:00:00 2001 From: Priyanka Marmath Date: Wed, 10 Sep 2025 13:50:02 +0200 Subject: [PATCH] lab solved --- lab-dw-pandas.ipynb | 715 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 698 insertions(+), 17 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..3b95873cb 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,540 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "customer_df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\")\n", + "customer_df #Identify the dimensions of the dataset by determining the number of rows and columns it contains." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "397e3d20-23a6-49ec-bbf1-45dd0f941c50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 4008 entries, 0 to 4007\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Customer 1071 non-null object \n", + " 1 ST 1071 non-null object \n", + " 2 GENDER 954 non-null object \n", + " 3 Education 1071 non-null object \n", + " 4 Customer Lifetime Value 1068 non-null object \n", + " 5 Income 1071 non-null float64\n", + " 6 Monthly Premium Auto 1071 non-null float64\n", + " 7 Number of Open Complaints 1071 non-null object \n", + " 8 Policy Type 1071 non-null object \n", + " 9 Vehicle Class 1071 non-null object \n", + " 10 Total Claim Amount 1071 non-null float64\n", + "dtypes: float64(3), object(8)\n", + "memory usage: 344.6+ KB\n" + ] + } + ], + "source": [ + "customer_df.info() #Determine the data types of each column \n", + "#customer lifetime value datatype can also be float64 as the results are in %." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "53a8a0f8-7327-44ca-8c45-855ec4b77f38", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer object\n", + "ST object\n", + "GENDER object\n", + "Education object\n", + "Customer Lifetime Value object\n", + "Income float64\n", + "Monthly Premium Auto float64\n", + "Number of Open Complaints object\n", + "Policy Type object\n", + "Vehicle Class object\n", + "Total Claim Amount float64\n", + "dtype: object" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_df.dtypes #Identify the number of unique values for each column and determine which columns appear to be categorical. 
You should also describe the unique values of each categorical column and the range of values for numerical columns, and give your insights." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "474f7668-e8be-4e80-9f3c-ab4b2bfdb723", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_df.nunique() \n", + "\n", + "#Categorical Columns: Customer, ST, GENDER, Education, Policy Type, Vehicle Class\n", + "#Numerical Columns: Customer Lifetime Value, Income, Monthly Premium Auto, Number of Open Complaints, Total Claim Amount" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ce29d83f-b7ba-426d-8e8d-d37f1cfbbc2e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IncomeMonthly Premium AutoTotal Claim Amount
count1071.0000001071.0000001071.000000
mean39295.701214193.234360404.986909
std30469.4270601601.190369293.027260
min0.00000061.0000000.382107
25%14072.00000068.000000202.157702
50%36234.00000083.000000354.729129
75%64631.000000109.500000532.800000
max99960.00000035354.0000002893.239678
\n", + "
" + ], + "text/plain": [ + " Income Monthly Premium Auto Total Claim Amount\n", + "count 1071.000000 1071.000000 1071.000000\n", + "mean 39295.701214 193.234360 404.986909\n", + "std 30469.427060 1601.190369 293.027260\n", + "min 0.000000 61.000000 0.382107\n", + "25% 14072.000000 68.000000 202.157702\n", + "50% 36234.000000 83.000000 354.729129\n", + "75% 64631.000000 109.500000 532.800000\n", + "max 99960.000000 35354.000000 2893.239678" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_df.describe()\n", + "#Compute summary statistics such as mean, median, mode, standard deviation, and quartiles " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "dab8f0b7-a4c3-404b-a992-cd953815f692", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 321.6\n", + "Name: Total Claim Amount, dtype: float64\n" + ] + } + ], + "source": [ + "mode_value= customer_df[\"Total Claim Amount\"].mode()\n", + "print(mode_value)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cf77ed18-a7d8-4e54-aba8-aa2ed95e4157", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "354.729129\n" + ] + } + ], + "source": [ + "median_value= customer_df[\"Total Claim Amount\"].median()\n", + "print(median_value)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e489da2-ea40-4ccd-bd1d-e20e53fb7d12", + "metadata": {}, + "outputs": [], + "source": [ + "mode_value= customer_df[\"Income\"].mode()\n", + "print(mode_value)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "942cfa56-b7b7-484a-8f52-59008d7b89bf", + "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "median_value= customer_df[\"Income\"].median()\n", + "print(median_value)" ] }, { @@ -116,12 +644,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": 
"2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "# 5 least common customer locations\n", + "(customer_df.ST.value_counts()).nsmallest(5)\n" ] }, { @@ -146,12 +692,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Policies sold by type:\n", + " Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n", + "\n", + "Most sold policy type: Personal Auto with 780 policies\n" + ] + } + ], "source": [ - "# Your code here" + "# total number of policies sold for each type of policy\n", + "policy_counts = customer_df[\"Policy Type\"].value_counts()\n", + "top_policy_type = policy_counts.idxmax()\n", + "top_policy_count = policy_counts.max()\n", + "\n", + "print(\"Policies sold by type:\\n\", policy_counts)\n", + "print(\"\\nMost sold policy type:\", top_policy_type, \"with\", top_policy_count, \"policies\")\n" ] }, { @@ -176,12 +743,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Income for Personal Auto customers: 38180.69871794872\n", + "Average Income for Corporate Auto customers: 41390.31196581197\n" + ] + } + ], "source": [ - "# Your code here" + "# Check whether customers with Personal Auto have a lower average income than those with Corporate Auto.\n", + "personal_auto_df = customer_df.loc[customer_df[\"Policy 
Type\"] == \"Personal Auto\"]\n", + "corporate_auto_df = customer_df.loc[customer_df[\"Policy Type\"] == \"Corporate Auto\"]\n", + "\n", + "#average income\n", + "avg_income_personal = personal_auto_df[\"Income\"].mean()\n", + "avg_income_corporate = corporate_auto_df[\"Income\"].mean()\n", + "\n", + "print(\"Average Income for Personal Auto customers:\", avg_income_personal)\n", + "print(\"Average Income for Corporate Auto customers:\", avg_income_corporate)\n" ] }, { @@ -226,20 +811,116 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "b731bca6-a760-4860-a27b-a33efa712ce0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total Claim Amount Statistics:\n", + " count 1071.000000\n", + "mean 404.986909\n", + "std 293.027260\n", + "min 0.382107\n", + "25% 202.157702\n", + "50% 354.729129\n", + "75% 532.800000\n", + "max 2893.239678\n", + "Name: Total Claim Amount, dtype: float64\n", + "\n", + "75th Percentile (Threshold for High Claims): 532.8\n", + "\n", + "High Claim Customers Data:\n", + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "1 QZ44356 Arizona F Bachelor 697953.59% 0.0 \n", + "2 AI49188 Nevada F Bachelor 1288743.17% 48767.0 \n", + "17 OE15005 Cali NaN College 394524.16% 28855.0 \n", + "23 TZ98966 Nevada NaN Bachelor 245019.10% 0.0 \n", + "26 US89481 California NaN Bachelor 394637.21% 0.0 \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "1 94.0 1/0/00 Personal Auto \n", + "2 108.0 1/0/00 Personal Auto \n", + "17 101.0 1/0/00 Personal Auto \n", + "23 73.0 1/3/00 Corporate Auto \n", + "26 111.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "17 SUV 647.442031 \n", + "23 Four-Door Car 554.376763 \n", + "26 Four-Door Car 799.200000 \n", + "\n", + "Summary Statistics for High Claim Customers:\n", + " 
Customer ST GENDER Education Customer Lifetime Value \\\n", + "count 264 264 238 264 264 \n", + "unique 264 7 5 5 256 \n", + "top QZ44356 Oregon F Bachelor 578018.22% \n", + "freq 1 90 115 85 3 \n", + "mean NaN NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN NaN \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints \\\n", + "count 264.000000 264.000000 264 \n", + "unique NaN NaN 6 \n", + "top NaN NaN 1/0/00 \n", + "freq NaN NaN 206 \n", + "mean 23677.344697 165.193182 NaN \n", + "std 27013.483721 623.930992 NaN \n", + "min 0.000000 63.000000 NaN \n", + "25% 0.000000 99.000000 NaN \n", + "50% 18807.000000 114.000000 NaN \n", + "75% 42423.750000 133.250000 NaN \n", + "max 99316.000000 10202.000000 NaN \n", + "\n", + " Policy Type Vehicle Class Total Claim Amount \n", + "count 264 264 264.000000 \n", + "unique 3 6 NaN \n", + "top Personal Auto SUV NaN \n", + "freq 191 101 NaN \n", + "mean NaN NaN 782.228263 \n", + "std NaN NaN 292.751640 \n", + "min NaN NaN 537.600000 \n", + "25% NaN NaN 606.521741 \n", + "50% NaN NaN 679.597985 \n", + "75% NaN NaN 851.400000 \n", + "max NaN NaN 2893.239678 \n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "claim_stats = customer_df[\"Total Claim Amount\"].describe()\n", + "print(\"Total Claim Amount Statistics:\\n\", claim_stats)\n", + "\n", + "# Step 2: Find the 75th percentile\n", + "q75 = customer_df[\"Total Claim Amount\"].quantile(0.75)\n", + "\n", + "# Step 3: Filter customers with claim amounts > 75th percentile\n", + "high_claims_df = customer_df.loc[customer_df[\"Total Claim Amount\"] > q75]\n", + "\n", + "# Step 4: Summary statistics about high claim customers\n", + "high_claims_summary = high_claims_df.describe(include=\"all\")\n", + "\n", + "print(\"\\n75th Percentile (Threshold for High Claims):\", q75)\n", + "print(\"\\nHigh 
Claim Customers Data:\\n\", high_claims_df.head())\n", + "print(\"\\nSummary Statistics for High Claim Customers:\\n\", high_claims_summary)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -251,7 +932,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,