From 08cf36e97587fe9a823f77f8afda8a9164438489 Mon Sep 17 00:00:00 2001 From: medilin Date: Mon, 25 Aug 2025 23:47:55 +0200 Subject: [PATCH] solved lab --- lab-hypothesis-testing.ipynb | 455 +++++++++++++++++++++++++++++++++-- 1 file changed, 437 insertions(+), 18 deletions(-) diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..112139f 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -40,6 +40,43 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting pandas\n", + " Using cached pandas-2.3.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)\n", + "Collecting scipy\n", + " Using cached scipy-1.16.1-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)\n", + "Collecting numpy\n", + " Using cached numpy-2.3.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib/python3.13/site-packages (from pandas) (2.9.0.post0)\n", + "Collecting pytz>=2020.1 (from pandas)\n", + " Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n", + "Collecting tzdata>=2022.7 (from pandas)\n", + " Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)\n", + "Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", + "Using cached pandas-2.3.2-cp313-cp313-macosx_11_0_arm64.whl (10.7 MB)\n", + "Using cached scipy-1.16.1-cp313-cp313-macosx_14_0_arm64.whl (20.8 MB)\n", + "Using cached numpy-2.3.2-cp313-cp313-macosx_14_0_arm64.whl (5.1 MB)\n", + "Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)\n", + "Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)\n", + "Installing collected packages: pytz, tzdata, numpy, scipy, pandas\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5/5\u001b[0m [pandas]2m4/5\u001b[0m [pandas]\n", + "\u001b[1A\u001b[2KSuccessfully installed numpy-2.3.2 pandas-2.3.2 pytz-2025.2 scipy-1.16.1 tzdata-2025.2\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install pandas scipy numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, "outputs": [], "source": [ "#libraries\n", @@ -51,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -278,7 +315,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +334,57 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "32" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df[df['Type 1'] == 'Dragon']['HP'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=np.float64(2.7001863481114143), pvalue=np.float64(0.9954662680969679), df=np.float64(58.0))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#code here" + "# H0: Dragon mu HP >= Grass mu HP\n", + "# H1: Dragon mu HP < Grass mu HP\n", + "# We consider a sample of 40 Pokemons from each type\n", + "# Significance level = 0.05\n", + "\n", + "dragon = df[df['Type 1'] == 'Dragon']['HP'].sample(30)\n", + "grass = df[df['Type 1'] == 'Grass']['HP'].sample(30)\n", + "\n", + "st.ttest_ind(dragon, grass, alternative='less')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are not able to reject H0 hypothesis: no significant evidence" ] }, { @@ -313,11 +396,40 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=array([-6.16755672, -6.16500997, -4.60731302, -7.41991116, -7.3658484 ,\n", + " -5.41183674]), pvalue=array([2.87308611e-08, 2.90433066e-08, 1.56437783e-05, 1.23935302e-10,\n", + " 1.57405763e-10, 6.65564558e-07]), df=array([78., 78., 78., 78., 78., 78.]))" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#code here" + "# H0: Legendary mu stats = Non Legendary mu stats\n", + "# H1: Legendary mu stats != Non Legendary mu stats\n", + "# We consider a sample of 40 Pokemons from each type\n", + "# Significant level = 0.05\n", + "\n", + "non_legendary = df.loc[df['Legendary'] == False, ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].sample(40)\n", + "\n", + "legendary = df.loc[df['Legendary'] == True, ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].sample(40)\n", + "\n", + "st.ttest_ind(non_legendary, legendary, alternative='two-sided')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We reject the null hypothesis because all the p-values are lower than the significance level" ] }, { @@ -337,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -453,7 +565,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -483,22 +595,329 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valuedistance_to_hospitaldistance_to_schoolclose_to_services
0-114.3134.1915.05612.01283.01015.0472.01.493666900.08.1873193.694888False
1-114.4734.4019.07650.01901.01129.0463.01.820080100.07.9662353.552591False
2-114.5633.6917.0720.0174.0333.0117.01.650985700.08.1430773.453940False
3-114.5733.6414.01501.0337.0515.0226.03.191773400.08.1544163.448840False
4-114.5733.5720.01454.0326.0624.0262.01.925065500.08.1835083.456848False
.......................................
16995-124.2640.5852.02217.0394.0907.0369.02.3571111400.04.2336759.082070False
16996-124.2740.6936.02349.0528.01194.0465.02.517979000.04.3323209.168915False
16997-124.3041.8417.02677.0531.01244.0456.03.0313103600.05.35869410.057614False
16998-124.3041.8019.02672.0552.01298.0478.01.979785800.05.32259310.026465False
16999-124.3540.5452.01820.0300.0806.0270.03.014794600.04.2490129.115597False
\n", + "

17000 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "... ... ... ... ... ... \n", + "16995 -124.26 40.58 52.0 2217.0 394.0 \n", + "16996 -124.27 40.69 36.0 2349.0 528.0 \n", + "16997 -124.30 41.84 17.0 2677.0 531.0 \n", + "16998 -124.30 41.80 19.0 2672.0 552.0 \n", + "16999 -124.35 40.54 52.0 1820.0 300.0 \n", + "\n", + " population households median_income median_house_value \\\n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "... ... ... ... ... \n", + "16995 907.0 369.0 2.3571 111400.0 \n", + "16996 1194.0 465.0 2.5179 79000.0 \n", + "16997 1244.0 456.0 3.0313 103600.0 \n", + "16998 1298.0 478.0 1.9797 85800.0 \n", + "16999 806.0 270.0 3.0147 94600.0 \n", + "\n", + " distance_to_hospital distance_to_school close_to_services \n", + "0 8.187319 3.694888 False \n", + "1 7.966235 3.552591 False \n", + "2 8.143077 3.453940 False \n", + "3 8.154416 3.448840 False \n", + "4 8.183508 3.456848 False \n", + "... ... ... ... \n", + "16995 4.233675 9.082070 False \n", + "16996 4.332320 9.168915 False \n", + "16997 5.358694 10.057614 False \n", + "16998 5.322593 10.026465 False \n", + "16999 4.249012 9.115597 False \n", + "\n", + "[17000 rows x 12 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the hospital coordinates\n", + "hospital_lon, hospital_lat = -122.0, 37.0\n", + "\n", + "# Define the school coordinates\n", + "school_lon, school_lat = -118.0, 34.0\n", + "\n", + "# Compute Euclidean distance to hospital and assign to new column\n", + "df['distance_to_hospital'] = np.sqrt(\n", + " (df.longitude - hospital_lon)**2 +\n", + " (df.latitude - hospital_lat)**2\n", + ")\n", + "\n", + "# Compute Euclidean distance to school and assign to new column\n", + "df['distance_to_school'] = np.sqrt(\n", + " (df.longitude - school_lon)**2 +\n", + " (df.latitude - school_lat)**2\n", + ")\n", + "\n", + "# Create the boolean column\n", + "df[\"close_to_services\"] = (\n", + " (df[\"distance_to_hospital\"] < 0.5) |\n", + " (df[\"distance_to_school\"] < 0.5)\n", + ")\n", + "\n", + "df" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=np.float64(2.97754414727049), pvalue=np.float64(0.9980651161168688), df=np.float64(78.0))" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# H0: houses close to either a school or a hospital price > houses far from either a school or a hospital\n", + "# H1: houses close to either a school or a hospital price <= houses far from either a school or a hospital\n", + "# Significant level = 0.05\n", + "\n", + "far = df[df['close_to_services'] == False]['median_house_value'].sample(40)\n", + "close = df[df['close_to_services'] == True]['median_house_value'].sample(40)\n", + "\n", + "st.ttest_ind(close, far, alternative='less')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "we fail to reject the null hypothesis because the p-value is greater than the significant level" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -512,7 +931,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.7" } }, "nbformat": 4,