From b54d685b9b602c38b22ecd51f7bf75648f8f225f Mon Sep 17 00:00:00 2001 From: Camilla Scandola <103769428+camilla-scandola@users.noreply.github.com> Date: Sun, 23 Nov 2025 23:12:21 +0100 Subject: [PATCH 1/2] solved --- .../lab-hypothesis-testing-checkpoint.ipynb | 520 ++++++++++++++++++ lab-hypothesis-testing.ipynb | 156 +++++- 2 files changed, 657 insertions(+), 19 deletions(-) create mode 100644 .ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb diff --git a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb new file mode 100644 index 0000000..0cc26d5 --- /dev/null +++ b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb @@ -0,0 +1,520 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab | Hypothesis Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Objective**\n", + "\n", + "Welcome to the Hypothesis Testing Lab, where we embark on an enlightening journey through the realm of statistical decision-making! In this laboratory, we delve into various scenarios, applying the powerful tools of hypothesis testing to scrutinize and interpret data.\n", + "\n", + "From testing the mean of a single sample (One Sample T-Test), to investigating differences between independent groups (Two Sample T-Test), and exploring relationships within dependent samples (Paired Sample T-Test), our exploration knows no bounds. Furthermore, we'll venture into the realm of Analysis of Variance (ANOVA), unraveling the complexities of comparing means across multiple groups.\n", + "\n", + "So, grab your statistical tools, prepare your hypotheses, and let's embark on this fascinating journey of exploration and discovery in the world of hypothesis testing!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 1**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with pokemon data. The data can be found here:\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#libraries\n", + "import pandas as pd\n", + "import scipy.stats as st\n", + "import numpy as np\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " Speed Generation Legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#code here" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the propper test and, with 5% significance, comment your findings.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "#code here" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 34)\n", + "- Hospital coordinates (-122, 37)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..9bd963a 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -45,13 +45,12 @@ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", - "import numpy as np\n", - "\n" + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -278,7 +277,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +296,42 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "83.3125\n", + "67.27142857142857\n", + "t-statistic: 3.3349632905124063\n", + "One-sided p-value (Dragon > Grass): 0.0007993609745420599\n", + "Reject H0: Dragons have higher HP at 5% level\n" + ] + } + ], "source": [ - "#code here" + "# Filter the two groups \n", + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']\n", + "\n", + "# Welch’s t-test (unequal variances), one-sided\n", + "t_stat, p_value_two_sided = ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "# Convert to one-sided p-value (Dragon > Grass)\n", + "p_value = p_value_two_sided / 2 if t_stat > 0 else 1 - p_two_sided / 2\n", + "\n", + "print(dragon_hp.mean())\n", + "print(grass_hp.mean())\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"One-sided p-value (Dragon > Grass):\", p_value)\n", + "\n", + "alpha = 0.05\n", + "if p_value < alpha:\n", + " print(\"Reject H0: Dragons have higher HP at 5% level\")\n", + "else:\n", + " print(\"Fail to reject H0: No evidence Dragons have higher HP\")" ] }, { @@ -313,11 +343,54 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "KeyError", + "evalue": "'Legendary'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'Legendary'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[32], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Two groups\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_legendary \u001b[38;5;241m=\u001b[39m df[df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLegendary\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m]\n\u001b[1;32m 3\u001b[0m df_nonlegendary \u001b[38;5;241m=\u001b[39m df[df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLegendary\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m]\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Stats to compare\u001b[39;00m\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'Legendary'" + ] + } + ], "source": [ - "#code here" + "# Two groups\n", + "df_legendary = df[df[\"Legendary\"] == True]\n", + "df_nonlegendary = df[df[\"Legendary\"] == False]\n", + "\n", + "# Stats to compare\n", + "stats_list = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n", + "\n", + "alpha = 0.05\n", + "\n", + "for stat in stats_list:\n", + " leg = df_legendary[stat]\n", + " non = df_nonlegendary[stat]\n", + "\n", + " f_stat, p_value = st.f_oneway(leg, non)\n", + " \n", + " print(f\"{stat}:\")\n", + " print(f\"F-statistic: {f_stat:.4f}, p-value: {p_value:.8f}\")\n", + "\n", + " if p_value < alpha:\n", + " print(\"Reject H0: Legendary and Non-Legendary differ in this stat\")\n", + " else:\n", + " print(\"Fail to reject H0: No difference detected\")" ] }, { @@ -337,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -453,7 +526,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -483,10 +556,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean close: 246951.98213501245\n", + "Mean far: 180678.44105790975\n", + "t-statistic: 37.992330214201516\n", + "One-sided p-value (close > far): 1.5032478884296307e-301\n", + "Reject H0: Houses close to school or hospital are more expensive at 5% level\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# School and hospital coordinates\n", + "school = (-118, 34)\n", + "hospital = (-122, 37)\n", + "\n", + "def euclidean_dist(x, y, x0, y0):\n", + " return np.sqrt((x - x0)**2 + (y - y0)**2)\n", + "\n", + "df[\"dist_school\"] = euclidean_dist(df[\"longitude\"], df[\"latitude\"], school[0], school[1])\n", + "df[\"dist_hospital\"] = euclidean_dist(df[\"longitude\"], df[\"latitude\"], hospital[0], hospital[1])\n", + "\n", + "df[\"close\"] = ((df[\"dist_school\"] < 0.50) | (df[\"dist_hospital\"] < 0.50))\n", + "\n", + "close_prices = df[df[\"close\"] == True][\"median_house_value\"]\n", + "far_prices = df[df[\"close\"] == False][\"median_house_value\"]\n", + "\n", + "from scipy.stats import ttest_ind\n", + "\n", + "t_stat, p_two = ttest_ind(close_prices, far_prices, equal_var=False)\n", + "p_one = p_two / 2 if t_stat > 0 else 1 - p_two / 2\n", + "\n", + "print(\"Mean close:\", close_prices.mean())\n", + "print(\"Mean far:\", far_prices.mean())\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"One-sided p-value (close > far):\", p_one)\n", + "\n", + "alpha = 0.05\n", + "if p_one < alpha:\n", + " print(\"Reject H0: Houses close to school or hospital are more expensive at 5% level\")\n", + "else:\n", + " print(\"Fail to reject H0: No evidence close houses are more expensive\")" + ] }, { "cell_type": "code", @@ -498,9 +616,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -512,9 +630,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.5" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From a5e2851aefa4ccd665a5a57b968aaa452b7bd6d4 Mon Sep 17 00:00:00 2001 From: Camilla Scandola <103769428+camilla-scandola@users.noreply.github.com> Date: Sun, 23 Nov 2025 23:14:07 +0100 Subject: [PATCH 2/2] solved lab --- .../lab-hypothesis-testing-checkpoint.ipynb | 159 +++++++++++++++--- lab-hypothesis-testing.ipynb | 41 ++--- 2 files changed, 162 insertions(+), 38 deletions(-) diff --git a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb index 0cc26d5..7c41476 100644 --- a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb +++ b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb @@ -45,13 +45,12 @@ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", - "import numpy as np\n", - "\n" + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -278,7 +277,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +296,42 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "83.3125\n", + "67.27142857142857\n", + "t-statistic: 3.3349632905124063\n", + "One-sided p-value (Dragon > Grass): 0.0007993609745420599\n", + "Reject H0: Dragons have higher HP at 5% level\n" + ] + } + ], "source": [ - "#code here" + "# Filter the two groups \n", + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']\n", + "\n", + "# Welch’s t-test (unequal variances), one-sided\n", + "t_stat, p_value_two_sided = ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "# Convert to one-sided p-value (Dragon > Grass)\n", + "p_value = p_value_two_sided / 2 if t_stat > 0 else 1 - p_two_sided / 2\n", + "\n", + "print(dragon_hp.mean())\n", + "print(grass_hp.mean())\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"One-sided p-value (Dragon > Grass):\", p_value)\n", + "\n", + "alpha = 0.05\n", + "if p_value < alpha:\n", + " print(\"Reject H0: Dragons have higher HP at 5% level\")\n", + "else:\n", + " print(\"Fail to reject H0: No evidence Dragons have higher HP\")" ] }, { @@ -313,11 +343,57 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP:\n", + "F-statistic: 64.5793, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Attack:\n", + "F-statistic: 108.1043, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Defense:\n", + "F-statistic: 51.5702, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Sp. Atk:\n", + "F-statistic: 201.3960, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Sp. Def:\n", + "F-statistic: 121.8319, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Speed:\n", + "F-statistic: 95.3598, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n" + ] + } + ], "source": [ - "#code here" + "# Two groups\n", + "df_legendary = df[df[\"Legendary\"] == True]\n", + "df_nonlegendary = df[df[\"Legendary\"] == False]\n", + "\n", + "# Stats to compare\n", + "stats_list = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n", + "\n", + "alpha = 0.05\n", + "\n", + "for stat in stats_list:\n", + " leg = df_legendary[stat]\n", + " non = df_nonlegendary[stat]\n", + "\n", + " f_stat, p_value = st.f_oneway(leg, non)\n", + " \n", + " print(f\"{stat}:\")\n", + " print(f\"F-statistic: {f_stat:.4f}, p-value: {p_value:.8f}\")\n", + "\n", + " if p_value < alpha:\n", + " print(\"Reject H0: Legendary and Non-Legendary differ in this stat\")\n", + " else:\n", + " print(\"Fail to reject H0: No difference detected\")" ] }, { @@ -337,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -453,7 +529,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -483,10 +559,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean close: 246951.98213501245\n", + "Mean far: 180678.44105790975\n", + "t-statistic: 37.992330214201516\n", + "One-sided p-value (close > far): 1.5032478884296307e-301\n", + "Reject H0: Houses close to school or hospital are more expensive at 5% level\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# School and hospital coordinates\n", + "school = (-118, 34)\n", + "hospital = (-122, 37)\n", + "\n", + "def euclidean_dist(x, y, x0, y0):\n", + " return np.sqrt((x - x0)**2 + (y - y0)**2)\n", + "\n", + "df[\"dist_school\"] = euclidean_dist(df[\"longitude\"], df[\"latitude\"], school[0], school[1])\n", + "df[\"dist_hospital\"] = euclidean_dist(df[\"longitude\"], df[\"latitude\"], hospital[0], hospital[1])\n", + "\n", + "df[\"close\"] = ((df[\"dist_school\"] < 0.50) | (df[\"dist_hospital\"] < 0.50))\n", + "\n", + "close_prices = df[df[\"close\"] == True][\"median_house_value\"]\n", + "far_prices = df[df[\"close\"] == False][\"median_house_value\"]\n", + "\n", + "from scipy.stats import ttest_ind\n", + "\n", + "t_stat, p_two = ttest_ind(close_prices, far_prices, equal_var=False)\n", + "p_one = p_two / 2 if t_stat > 0 else 1 - p_two / 2\n", + "\n", + "print(\"Mean close:\", close_prices.mean())\n", + "print(\"Mean far:\", far_prices.mean())\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"One-sided p-value (close > far):\", p_one)\n", + "\n", + "alpha = 0.05\n", + "if p_one < alpha:\n", + " print(\"Reject H0: Houses close to school or hospital are more expensive at 5% level\")\n", + "else:\n", + " print(\"Fail to reject H0: No evidence close houses are more expensive\")" + ] }, { "cell_type": "code", @@ -498,9 +619,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -512,9 +633,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.5" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 9bd963a..7c41476 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -343,28 +343,31 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 35, "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "'Legendary'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'Legendary'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[32], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Two groups\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_legendary \u001b[38;5;241m=\u001b[39m df[df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLegendary\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m]\n\u001b[1;32m 3\u001b[0m df_nonlegendary \u001b[38;5;241m=\u001b[39m df[df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLegendary\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m]\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Stats to compare\u001b[39;00m\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m/opt/anaconda3/lib/python3.13/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'Legendary'" + "name": "stdout", + "output_type": "stream", + "text": [ + "HP:\n", + "F-statistic: 64.5793, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Attack:\n", + "F-statistic: 108.1043, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Defense:\n", + "F-statistic: 51.5702, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Sp. Atk:\n", + "F-statistic: 201.3960, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Sp. Def:\n", + "F-statistic: 121.8319, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n", + "Speed:\n", + "F-statistic: 95.3598, p-value: 0.00000000\n", + "Reject H0: Legendary and Non-Legendary differ in this stat\n" ] } ],