From d65953a582ded8458e731e29c6a14304bcb97b36 Mon Sep 17 00:00:00 2001 From: davherdel Date: Sat, 16 Aug 2025 15:37:19 +0100 Subject: [PATCH] Uploaded finished notebook --- lab-dw-pandas.ipynb | 696 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 681 insertions(+), 15 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..5ec019ad3 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -80,14 +80,533 @@ "- Compute summary statistics for categorical columns and providing your conclusions based on these statistics." ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1222f185", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "68a61b80", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Loading the data\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n", + "df = pd.read_csv(url)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a1eac870", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value',\n", + " 'Income', 'Monthly Premium Auto', 'Number of Open Complaints',\n", + " 'Policy Type', 'Vehicle Class', 'Total Claim Amount'],\n", + " dtype='object')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, { "cell_type": "code", "execution_count": null, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape: (4008, 11)\n", + "\n", + "Data types:\n", + " Customer object\n", + "ST object\n", + "GENDER object\n", + "Education object\n", + "Customer Lifetime Value object\n", + "Income float64\n", + "Monthly Premium Auto float64\n", + "Number of Open Complaints object\n", + "Policy Type object\n", + "Vehicle Class object\n", + "Total Claim Amount float64\n", + "dtype: object\n", + "\n", + "Unique value counts per column:\n", + " Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64\n", + "\n", + "Categorical columns: ['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value', 'Number of Open Complaints', 'Policy Type', 'Vehicle Class']\n", + "\n", + "Customer -> uniques: ['RB50392' 'QZ44356' 'AI49188' ... 'CW49887' 'MY31220' nan]\n", + "\n", + "ST -> uniques: ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon' 'Cali' 'AZ' 'WA'\n", + " nan]\n", + "\n", + "GENDER -> uniques: [nan 'F' 'M' 'Femal' 'Male' 'female']\n", + "\n", + "Education -> uniques: ['Master' 'Bachelor' 'High School or Below' 'College' 'Bachelors' 'Doctor'\n", + " nan]\n", + "\n", + "Customer Lifetime Value -> uniques: [nan '697953.59%' '1288743.17%' ... '2031499.76%' '323912.47%'\n", + " '899704.02%']\n", + "\n", + "Number of Open Complaints -> uniques: ['1/0/00' '1/2/00' '1/1/00' '1/3/00' '1/5/00' '1/4/00' nan]\n", + "\n", + "Policy Type -> uniques: ['Personal Auto' 'Corporate Auto' 'Special Auto' nan]\n", + "\n", + "Vehicle Class -> uniques: ['Four-Door Car' 'Two-Door Car' 'SUV' 'Luxury SUV' 'Sports Car'\n", + " 'Luxury Car' nan]\n", + "\n", + "Numeric ranges:\n", + "Income: 0.0 to 99960.0\n", + "Monthly Premium Auto: 61.0 to 35354.0\n", + "Total Claim Amount: 0.382107 to 2893.239678\n", + "\n", + "Numeric summary:\n", + " Income Monthly Premium Auto Total Claim Amount\n", + "count 1071.000000 1071.000000 1071.000000\n", + "mean 39295.701214 193.234360 404.986909\n", + "std 30469.427060 1601.190369 293.027260\n", + "min 0.000000 61.000000 0.382107\n", + "25% 14072.000000 68.000000 202.157702\n", + "50% 36234.000000 83.000000 354.729129\n", + "75% 64631.000000 109.500000 532.800000\n", + "max 99960.000000 35354.000000 2893.239678\n", + "\n", + "Median:\n", + " Income 36234.000000\n", + "Monthly Premium Auto 83.000000\n", + "Total Claim Amount 354.729129\n", + "dtype: float64\n", + "\n", + "Mode:\n", + " Income 0.0\n", + "Monthly Premium Auto 65.0\n", + "Total Claim Amount 321.6\n", + "Name: 0, dtype: float64\n", + "\n", + "Customer value counts:\n", + " Customer\n", + "RB50392 1\n", + "HJ15383 1\n", + "AO74776 1\n", + "HQ82233 1\n", + "OL72737 1\n", + " ..\n", + "RO26085 1\n", + "ES57969 1\n", + "JK55587 1\n", + "RN97635 1\n", + "MY31220 1\n", + "Name: count, Length: 1071, dtype: int64\n", + "Top value: AA71604\n", + "\n", + "ST value counts:\n", + " ST\n", + "Oregon 320\n", + "California 211\n", + "Arizona 186\n", + "Cali 120\n", + "Nevada 98\n", + "Washington 81\n", + "WA 30\n", + "AZ 25\n", + "Name: count, dtype: int64\n", + "Top value: Oregon\n", + "\n", + "GENDER value counts:\n", + " GENDER\n", + "F 457\n", + "M 413\n", + "Male 39\n", + "female 28\n", + "Femal 17\n", + "Name: count, dtype: int64\n", + "Top value: F\n", + "\n", + "Education value counts:\n", + " Education\n", + "Bachelor 324\n", + "College 313\n", + "High School or Below 296\n", + "Master 94\n", + "Doctor 37\n", + "Bachelors 7\n", + "Name: count, dtype: int64\n", + "Top value: Bachelor\n", + "\n", + "Customer Lifetime Value value counts:\n", + " Customer Lifetime Value\n", + "445811.34% 4\n", + "251459.20% 4\n", + "272535.64% 3\n", + "578018.22% 3\n", + "684615.03% 3\n", + " ..\n", + "245357.08% 1\n", + "507566.27% 1\n", + "321497.94% 1\n", + "1227534.31% 1\n", + "899704.02% 1\n", + "Name: count, Length: 1027, dtype: int64\n", + "Top value: 251459.20%\n", + "\n", + "Number of Open Complaints value counts:\n", + " Number of Open Complaints\n", + "1/0/00 830\n", + "1/1/00 138\n", + "1/2/00 50\n", + "1/3/00 34\n", + "1/4/00 13\n", + "1/5/00 6\n", + "Name: count, dtype: int64\n", + "Top value: 1/0/00\n", + "\n", + "Policy Type value counts:\n", + " Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n", + "Top value: Personal Auto\n", + "\n", + "Vehicle Class value counts:\n", + " Vehicle Class\n", + "Four-Door Car 576\n", + "Two-Door Car 205\n", + "SUV 199\n", + "Sports Car 57\n", + "Luxury SUV 20\n", + "Luxury Car 14\n", + "Name: count, dtype: int64\n", + "Top value: Four-Door Car\n" + ] + } + ], "source": [ - "# Your code here" + "# Quick look at basic info\n", + "print(\"Shape:\", df.shape)\n", + "print(\"\\nData types:\\n\", df.dtypes)\n", + "\n", + "# maybe check if some columns need fixing\n", + "# if 'Customer Lifetime Value' looks weird, convert it:\n", + "# df['Customer Lifetime Value'] = pd.to_numeric(df['Customer Lifetime Value'], errors='coerce')\n", + "\n", + "# Unique counts\n", + "print(\"\\nUnique value counts per column:\\n\", df.nunique())\n", + "\n", + "# Find object-type cols (likely categoricals)\n", + "cat_cols = df.select_dtypes(include=\"object\").columns\n", + "print(\"\\nCategorical columns:\", list(cat_cols))\n", + "\n", + "# Peek at their unique values (can get messy if many)\n", + "for c in cat_cols:\n", + " print(f\"\\n{c} -> uniques:\", df[c].unique())\n", + "\n", + "# Ranges for numeric cols\n", + "num_cols = df.select_dtypes(include=[\"int64\", \"float64\"]).columns\n", + "print(\"\\nNumeric ranges:\")\n", + "for c in num_cols:\n", + " print(f\"{c}: {df[c].min()} to {df[c].max()}\")\n", + "\n", + "# Summary stats\n", + "print(\"\\nNumeric summary:\\n\", df[num_cols].describe())\n", + "\n", + "# Extra quick stats\n", + "print(\"\\nMedian:\\n\", df[num_cols].median())\n", + "print(\"\\nMode:\\n\", df[num_cols].mode().iloc[0])\n", + "\n", + "# Frequency for categories\n", + "for c in cat_cols:\n", + " print(f\"\\n{c} value counts:\\n\", df[c].value_counts())\n", + " print(\"Top value:\", df[c].mode()[0])\n" ] }, { @@ -116,12 +635,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Value counts for ST:\n", + " ST\n", + "Oregon 320\n", + "California 211\n", + "Arizona 186\n", + "Cali 120\n", + "Nevada 98\n", + "Washington 81\n", + "WA 30\n", + "AZ 25\n", + "Name: count, dtype: int64\n", + "\n", + "5 least common states (ascending):\n", + " ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64\n" + ] + } + ], "source": [ - "# Your code here" + "if 'ST' in df.columns:\n", + " st_counts = df['ST'].value_counts()\n", + " \n", + " # quick peek\n", + " print(\"\\nValue counts for ST:\\n\", st_counts)\n", + " \n", + " # least common 5\n", + " print(\"\\n5 least common states (ascending):\\n\", st_counts.sort_values().head(5))\n", + "\n" ] }, { @@ -146,12 +701,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total policies sold by type:\n", + " Policy Type\n", + "Personal Auto 780\n", + "Corporate Auto 234\n", + "Special Auto 57\n", + "Name: count, dtype: int64\n", + "\n", + "Most sold policy type: Personal Auto with 780 policies\n" + ] + } + ], "source": [ - "# Your code here" + "# Policy Type analysis\n", + "if 'Policy Type' in df.columns:\n", + " policy_counts = df['Policy Type'].value_counts()\n", + " print(\"\\nTotal policies sold by type:\\n\", policy_counts)\n", + " \n", + " # which one is the top seller?\n", + " top_policy = policy_counts.idxmax()\n", + " print(\"\\nMost sold policy type:\", top_policy, \"with\", policy_counts.max(), \"policies\")\n", + "else:\n", + " print(\"\\n 'Policy Type' column not found. Check column names.\")\n" ] }, { @@ -176,12 +756,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Number of Personal Auto policies: 780\n", + "Number of Corporate Auto policies: 234\n", + "\n", + "Average income for Personal Auto: 38180.7\n", + "Average income for Corporate Auto: 41390.31\n", + "Customers with Personal Auto have a lower average income.\n" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "\n", + "# Compare average income: Personal Auto vs Corporate Auto\n", + "if 'Policy Type' in df.columns and 'Income' in df.columns:\n", + " \n", + " personal_df = df.loc[df['Policy Type'] == 'Personal Auto']\n", + " corporate_df = df.loc[df['Policy Type'] == 'Corporate Auto']\n", + " \n", + " # quick sanity check on counts\n", + " print(\"\\nNumber of Personal Auto policies:\", len(personal_df))\n", + " print(\"Number of Corporate Auto policies:\", len(corporate_df))\n", + " \n", + " avg_personal = personal_df['Income'].mean()\n", + " avg_corporate = corporate_df['Income'].mean()\n", + " \n", + " print(\"\\nAverage income for Personal Auto:\", round(avg_personal, 2))\n", + " print(\"Average income for Corporate Auto:\", round(avg_corporate, 2))\n", + " \n", + " if avg_personal < avg_corporate:\n", + " print(\"Customers with Personal Auto have a lower average income.\")\n", + " elif avg_personal > avg_corporate:\n", + " print(\"Customers with Personal Auto have a higher average income.\")\n", + " else:\n", + " print(\"Average incomes are the same.\")\n", + "else:\n", + " print(\"\\n Required columns not found. Check column names.\")\n" ] }, { @@ -229,15 +848,62 @@ "execution_count": null, "id": "b731bca6-a760-4860-a27b-a33efa712ce0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Summary stats for Total Claim Amount:\n", + " count 1071.000000\n", + "mean 404.986909\n", + "std 293.027260\n", + "min 0.382107\n", + "25% 202.157702\n", + "50% 354.729129\n", + "75% 532.800000\n", + "max 2893.239678\n", + "Name: Total Claim Amount, dtype: float64\n", + "\n", + "75th percentile value: 532.8\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'cutoff_75_' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[12], line 14\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m75th percentile value:\u001b[39m\u001b[38;5;124m\"\u001b[39m, cutoff_75)\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# filter top 25% customers\u001b[39;00m\n\u001b[1;32m---> 14\u001b[0m high_claim_df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mloc[df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTotal Claim Amount\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m>\u001b[39m cutoff_75_]()\n", + "\u001b[1;31mNameError\u001b[0m: name 'cutoff_75_' is not defined" + ] + } + ], "source": [ - "# Your code here" + "# Your code here\n", + "\n", + "# high claim amount analysis\n", + "if 'Total Claim Amount' in df.columns:\n", + " \n", + " # quick look at stats for context\n", + " print(\"\\nSummary stats for Total Claim Amount:\\n\", df['Total Claim Amount'].describe())\n", + " \n", + " # 75th percentile cutoff\n", + " cutoff_75 = df['Total Claim Amount'].quantile(0.75)\n", + " print(\"\\n75th percentile value:\", cutoff_75)\n", + " \n", + " # filter top 25% customers\n", + " high_claim_df = df.loc[df['Total Claim Amount'] > cutoff_75_]()\n", + "\n", + "\n", + "#I am not entire sure I got this right, but I think it should be something like this" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -251,7 +917,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.7" } }, "nbformat": 4,