From 8042d0feb10870ad42c7b4c6925bd4e33404f43b Mon Sep 17 00:00:00 2001 From: Edosa Erhunmwunse Date: Thu, 13 Nov 2025 10:33:22 +0100 Subject: [PATCH] Lab hypothesis test --- lab-hypothesis-testing.ipynb | 1707 ++++++++++++++++++++++++++++++++-- 1 file changed, 1645 insertions(+), 62 deletions(-) diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..d047a06 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 165, "metadata": {}, "outputs": [], "source": [ @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 166, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 166, "metadata": {}, "output_type": "execute_result" } @@ -288,56 +288,1433 @@ "df" ] }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nametype1type2hpattackdefensesp_atksp_defspeedgenerationlegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " name type1 type2 hp attack defense sp_atk sp_def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " speed generation legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns=df.columns.str.strip().str.lower().str.replace('.', '_').str.replace(' ', '')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "df.isna().sum()\n", + "df=df.dropna().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nametype1type2hpattackdefensesp_atksp_defspeedgenerationlegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharizardFireFlying788478109851001False
....................................
409DiancieRockFairy50100150100150506True
410Mega DiancieRockFairy501601101601101106True
411Hoopa ConfinedPsychicGhost8011060150130706True
412Hoopa UnboundPsychicDark8016060170130806True
413VolcanionFireWater8011012013090706True
\n", + "

414 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " name type1 type2 hp attack defense sp_atk sp_def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charizard Fire Flying 78 84 78 109 85 \n", + ".. ... ... ... .. ... ... ... ... \n", + "409 Diancie Rock Fairy 50 100 150 100 150 \n", + "410 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "411 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "412 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "413 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " speed generation legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 100 1 False \n", + ".. ... ... ... \n", + "409 50 6 True \n", + "410 110 6 True \n", + "411 70 6 True \n", + "412 80 6 True \n", + "413 70 6 True \n", + "\n", + "[414 rows x 11 columns]" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# H0: Grp1_mean <= Grp2_mean\n", + "# H1: Grp1_mean > Grp2_mean" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 45\n", + "1 60\n", + "2 80\n", + "3 80\n", + "24 45\n", + "25 60\n", + "26 75\n", + "32 50\n", + "33 65\n", + "34 80\n", + "55 60\n", + "56 95\n", + "94 35\n", + "95 55\n", + "96 75\n", + "133 70\n", + "145 70\n", + "146 90\n", + "157 60\n", + "172 50\n", + "181 70\n", + "196 99\n", + "215 95\n", + "223 40\n", + "224 60\n", + "256 60\n", + "257 90\n", + "258 90\n", + "281 100\n", + "300 40\n", + "301 60\n", + "321 69\n", + "322 114\n", + "327 44\n", + "328 74\n", + "349 91\n", + "364 88\n", + "Name: hp, dtype: int64" + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grp_B=df[df[\"type1\"]==\"Grass\"][\"hp\"]\n", + "grp_B" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76 91\n", + "183 75\n", + "184 75\n", + "201 95\n", + "202 95\n", + "207 80\n", + "208 80\n", + "209 80\n", + "210 80\n", + "212 105\n", + "213 105\n", + "245 58\n", + "246 68\n", + "247 108\n", + "248 108\n", + "352 100\n", + "353 100\n", + "356 125\n", + "357 125\n", + "358 125\n", + "408 108\n", + "Name: hp, dtype: int64" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grp_A=df[df[\"type1\"]==\"Dragon\"][\"hp\"]\n", + "grp_A" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=4.649881427485321, pvalue=2.068191387085888e-05, df=56.0)" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy import stats\n", + "# 
This returns the t-statistic and the p-value\n", + "stats.ttest_ind(grp_A, grp_B)" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [], + "source": [ + "# H0: Grp1_mean <= Grp2_mean\n", + "# H1: Grp1_mean > Grp2_mean\n", + "## The p-value=2.06e-05 is lower than the significance level of 0.05, so we reject H0 and support that Grp1 (Dragon) has, on average, more HP than Grp2 (Grass)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the propper test and, with 5% significance, comment your findings.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# H0: Legendary Pokemons have same stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) as Non-legendary\n", + "# H1: Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) compared to Non-legendary" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [], + "source": [ + "# Separate the groups based on the 'legendary' column\n", + "stats=[\"hp\", \"attack\", \"defense\", \"sp_atk\", \"sp_def\", \"speed\",\"legendary\"]\n", + "df_stats = df[stats]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hpattackdefensesp_atksp_defspeedlegendary
0454949656545False
1606263808060False
280828310010080False
38010012312212080False
478847810985100False
........................
4095010015010015050True
41050160110160110110True
411801106015013070True
412801606017013080True
413801101201309070True
\n", + "

414 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " hp attack defense sp_atk sp_def speed legendary\n", + "0 45 49 49 65 65 45 False\n", + "1 60 62 63 80 80 60 False\n", + "2 80 82 83 100 100 80 False\n", + "3 80 100 123 122 120 80 False\n", + "4 78 84 78 109 85 100 False\n", + ".. .. ... ... ... ... ... ...\n", + "409 50 100 150 100 150 50 True\n", + "410 50 160 110 160 110 110 True\n", + "411 80 110 60 150 130 70 True\n", + "412 80 160 60 170 130 80 True\n", + "413 80 110 120 130 90 70 True\n", + "\n", + "[414 rows x 7 columns]" + ] + }, + "execution_count": 175, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [], + "source": [ + "from statsmodels.multivariate.manova import MANOVA" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Multivariate linear model\n", + "================================================================\n", + " \n", + "----------------------------------------------------------------\n", + " Intercept Value Num DF Den DF F Value Pr > F\n", + "----------------------------------------------------------------\n", + " Wilks' lambda 0.0530 6.0000 407.0000 1211.8954 0.0000\n", + " Pillai's trace 0.9470 6.0000 407.0000 1211.8954 0.0000\n", + " Hotelling-Lawley trace 17.8658 6.0000 407.0000 1211.8954 0.0000\n", + " Roy's greatest root 17.8658 6.0000 407.0000 1211.8954 0.0000\n", + "----------------------------------------------------------------\n", + " \n", + "----------------------------------------------------------------\n", + " legendary Value Num DF Den DF F Value Pr > F\n", + "----------------------------------------------------------------\n", + " Wilks' lambda 0.7117 6.0000 407.0000 27.4810 0.0000\n", + " Pillai's trace 0.2883 6.0000 407.0000 27.4810 0.0000\n", + " Hotelling-Lawley trace 0.4051 6.0000 407.0000 27.4810 
0.0000\n", + " Roy's greatest root 0.4051 6.0000 407.0000 27.4810 0.0000\n", + "================================================================\n", + "\n" + ] + } + ], + "source": [ + "\n", + "# Define the formula: all six stats explained by the 'legendary' status\n", + "formula = 'hp + attack + defense + sp_atk + sp_def + speed ~ legendary'\n", + "\n", + "# Run the MANOVA model\n", + "manova = MANOVA.from_formula(formula, data=df_stats)\n", + "\n", + "# Get the multivariate test results (Focus on Wilks' lambda)\n", + "manova_results = manova.mv_test()\n", + "print(manova_results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The p_value of all four test is less than 0.05 so we reject the null hypothesis\n", + "# Obtained result supports the claim that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) compared to Non-legendary " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 " + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." 
+ "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 34)\n", + "- Hospital coordinates (-122, 37)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: geopy in c:\\users\\hp\\anaconda3\\lib\\site-packages (2.4.1)\n", + "Requirement already satisfied: pandas in c:\\users\\hp\\anaconda3\\lib\\site-packages (2.3.3)\n", + "Requirement already satisfied: geographiclib<3,>=1.52 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from geopy) (2.1)\n", + "Requirement already satisfied: numpy>=1.26.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas) (1.26.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas) (2023.3)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install geopy pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [], + "source": [ + 
"# H0: Houses close to either a school or a hospital are not more expensive than houses far from both (mean_close <= mean_far).\n", + "# H1: Houses close to either a school or a hospital are more expensive than houses far from both (mean_close > mean_far)." + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "from geopy.distance import great_circle" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def calculate_great_circle_distance(row,target_point):\n", + " \n", + " # Point 1 (Start) must be (latitude, longitude)\n", + " start_point = (row['latitude'], row['longitude'])\n", + " \n", + " # Point 2 (End) must be (latitude, longitude)\n", + " end_point = target_point\n", + " \n", + " # The great_circle function returns a distance object; we extract miles/km\n", + " # Returning distance in Kilometers (you can use .miles, .meters, etc.). NOTE(review): the lab defines 'close' as a Euclidean distance < 0.50 in coordinate units, not km - confirm the threshold matches this unit.\n", + " distance_km = great_circle(start_point, end_point).km\n", + " return distance_km" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "school_point = (34.00, -118.00) \n", + "hospital_point = (37.00, -122.00) " + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"distance_school\"]=df.apply(calculate_great_circle_distance,axis=1, target_point=school_point)\n", + "df[\"distance_hospital\"]=df.apply(calculate_great_circle_distance,axis=1, target_point=hospital_point)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valuedistance_schooldistance_hospital
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0340.418792761.974459
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0327.659611738.576332
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0319.542519768.308921
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0319.365481770.371666
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0320.561844774.421877
....................................
16995-124.2640.5852.02217.0394.0907.0369.02.3571111400.0917.047590443.615947
16996-124.2740.6936.02349.0528.01194.0465.02.517979000.0927.102523454.929344
16997-124.3041.8417.02677.0531.01244.0456.03.0313103600.01031.462839573.239315
16998-124.3041.8019.02672.0552.01298.0478.01.979785800.01027.794143569.086288
16999-124.3540.5452.01820.0300.0806.0270.03.014794600.0918.431001443.183095
\n", + "

17000 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "... ... ... ... ... ... \n", + "16995 -124.26 40.58 52.0 2217.0 394.0 \n", + "16996 -124.27 40.69 36.0 2349.0 528.0 \n", + "16997 -124.30 41.84 17.0 2677.0 531.0 \n", + "16998 -124.30 41.80 19.0 2672.0 552.0 \n", + "16999 -124.35 40.54 52.0 1820.0 300.0 \n", + "\n", + " population households median_income median_house_value \\\n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "... ... ... ... ... \n", + "16995 907.0 369.0 2.3571 111400.0 \n", + "16996 1194.0 465.0 2.5179 79000.0 \n", + "16997 1244.0 456.0 3.0313 103600.0 \n", + "16998 1298.0 478.0 1.9797 85800.0 \n", + "16999 806.0 270.0 3.0147 94600.0 \n", + "\n", + " distance_school distance_hospital \n", + "0 340.418792 761.974459 \n", + "1 327.659611 738.576332 \n", + "2 319.542519 768.308921 \n", + "3 319.365481 770.371666 \n", + "4 320.561844 774.421877 \n", + "... ... ... \n", + "16995 917.047590 443.615947 \n", + "16996 927.102523 454.929344 \n", + "16997 1031.462839 573.239315 \n", + "16998 1027.794143 569.086288 \n", + "16999 918.431001 443.183095 \n", + "\n", + "[17000 rows x 11 columns]" + ] + }, + "execution_count": 188, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 189, "metadata": {}, "outputs": [], "source": [ - "#code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. 
Choose the propper test and, with 5% significance, comment your findings.\n" + "cond1=df[\"distance_school\"]<0.5\n", + "cond2=df[\"distance_hospital\"]<0.5" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 190, "metadata": {}, "outputs": [], "source": [ - "#code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Challenge 2**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this challenge, we will be working with california-housing data. The data can be found here:\n", - "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + "df[\"distance_category\"] = np.where(\n", + " cond1 | cond2,\"close\", \"far\" )" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 191, "metadata": {}, "outputs": [ { @@ -370,6 +1747,9 @@ " households\n", " median_income\n", " median_house_value\n", + " distance_school\n", + " distance_hospital\n", + " distance_category\n", " \n", " \n", " \n", @@ -384,6 +1764,9 @@ " 472.0\n", " 1.4936\n", " 66900.0\n", + " 340.418792\n", + " 761.974459\n", + " far\n", " \n", " \n", " 1\n", @@ -396,6 +1779,9 @@ " 463.0\n", " 1.8200\n", " 80100.0\n", + " 327.659611\n", + " 738.576332\n", + " far\n", " \n", " \n", " 2\n", @@ -408,6 +1794,9 @@ " 117.0\n", " 1.6509\n", " 85700.0\n", + " 319.542519\n", + " 768.308921\n", + " far\n", " \n", " \n", " 3\n", @@ -420,6 +1809,9 @@ " 226.0\n", " 3.1917\n", " 73400.0\n", + " 319.365481\n", + " 770.371666\n", + " far\n", " \n", " \n", " 4\n", @@ -432,53 +1824,237 @@ " 262.0\n", " 1.9250\n", " 65500.0\n", + " 320.561844\n", + " 774.421877\n", + " far\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 16995\n", + " -124.26\n", + " 40.58\n", + " 52.0\n", + " 2217.0\n", + " 394.0\n", + " 907.0\n", + " 369.0\n", + " 2.3571\n", + " 
111400.0\n", + " 917.047590\n", + " 443.615947\n", + " far\n", + " \n", + " \n", + " 16996\n", + " -124.27\n", + " 40.69\n", + " 36.0\n", + " 2349.0\n", + " 528.0\n", + " 1194.0\n", + " 465.0\n", + " 2.5179\n", + " 79000.0\n", + " 927.102523\n", + " 454.929344\n", + " far\n", + " \n", + " \n", + " 16997\n", + " -124.30\n", + " 41.84\n", + " 17.0\n", + " 2677.0\n", + " 531.0\n", + " 1244.0\n", + " 456.0\n", + " 3.0313\n", + " 103600.0\n", + " 1031.462839\n", + " 573.239315\n", + " far\n", + " \n", + " \n", + " 16998\n", + " -124.30\n", + " 41.80\n", + " 19.0\n", + " 2672.0\n", + " 552.0\n", + " 1298.0\n", + " 478.0\n", + " 1.9797\n", + " 85800.0\n", + " 1027.794143\n", + " 569.086288\n", + " far\n", + " \n", + " \n", + " 16999\n", + " -124.35\n", + " 40.54\n", + " 52.0\n", + " 1820.0\n", + " 300.0\n", + " 806.0\n", + " 270.0\n", + " 3.0147\n", + " 94600.0\n", + " 918.431001\n", + " 443.183095\n", + " far\n", " \n", " \n", "\n", + "

17000 rows × 12 columns

\n", "" ], "text/plain": [ - " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", - "0 -114.31 34.19 15.0 5612.0 1283.0 \n", - "1 -114.47 34.40 19.0 7650.0 1901.0 \n", - "2 -114.56 33.69 17.0 720.0 174.0 \n", - "3 -114.57 33.64 14.0 1501.0 337.0 \n", - "4 -114.57 33.57 20.0 1454.0 326.0 \n", + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "... ... ... ... ... ... \n", + "16995 -124.26 40.58 52.0 2217.0 394.0 \n", + "16996 -124.27 40.69 36.0 2349.0 528.0 \n", + "16997 -124.30 41.84 17.0 2677.0 531.0 \n", + "16998 -124.30 41.80 19.0 2672.0 552.0 \n", + "16999 -124.35 40.54 52.0 1820.0 300.0 \n", "\n", - " population households median_income median_house_value \n", - "0 1015.0 472.0 1.4936 66900.0 \n", - "1 1129.0 463.0 1.8200 80100.0 \n", - "2 333.0 117.0 1.6509 85700.0 \n", - "3 515.0 226.0 3.1917 73400.0 \n", - "4 624.0 262.0 1.9250 65500.0 " + " population households median_income median_house_value \\\n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "... ... ... ... ... \n", + "16995 907.0 369.0 2.3571 111400.0 \n", + "16996 1194.0 465.0 2.5179 79000.0 \n", + "16997 1244.0 456.0 3.0313 103600.0 \n", + "16998 1298.0 478.0 1.9797 85800.0 \n", + "16999 806.0 270.0 3.0147 94600.0 \n", + "\n", + " distance_school distance_hospital distance_category \n", + "0 340.418792 761.974459 far \n", + "1 327.659611 738.576332 far \n", + "2 319.542519 768.308921 far \n", + "3 319.365481 770.371666 far \n", + "4 320.561844 774.421877 far \n", + "... ... ... ... 
\n", + "16995 917.047590 443.615947 far \n", + "16996 927.102523 454.929344 far \n", + "16997 1031.462839 573.239315 far \n", + "16998 1027.794143 569.086288 far \n", + "16999 918.431001 443.183095 far \n", + "\n", + "[17000 rows x 12 columns]" ] }, - "execution_count": 5, + "execution_count": 191, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", - "df.head()" + "df" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 192, "metadata": {}, + "outputs": [], "source": [ - "**We posit that houses close to either a school or a hospital are more expensive.**\n", - "\n", - "- School coordinates (-118, 34)\n", - "- Hospital coordinates (-122, 37)\n", - "\n", - "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", - "\n", - "Hint:\n", - "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", - "- Divide your dataset into houses close and far from either a hospital or school.\n", - "- Choose the propper test and, with 5% significance, comment your findings.\n", - " " + "# H0: avg_price_closer <= avg_price_far.\n", + "# H1: avg_price_closer > avg_price_far." + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [], + "source": [ + "## If the average price of houses closer to schools and hospitals is higher than houses far from these places then we support that they are expensive." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": {}, + "outputs": [], + "source": [ + "df_close=df[df[\"distance_category\"]==\"close\"][\"median_house_value\"]\n", + "df_far=df[df[\"distance_category\"]==\"far\"][\"median_house_value\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13747 137500.0\n", + "Name: median_house_value, dtype: float64" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_close" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=-0.6018226451778325, pvalue=0.7263498868187508, df=16998.0)" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy import stats\n", + "# This returns the t-statistic and the p-value\n", + "stats.ttest_ind(df_close, df_far,alternative=\"greater\")" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [], + "source": [ + "## The p-value (0.726) is higher than 0.05, so we do not reject H0: there is not enough evidence that houses close to a school or a hospital --\n", + "## -- are more expensive." ] }, { @@ -488,6 +2064,13 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -498,7 +2081,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -512,9 +2095,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.7" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }