diff --git a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb new file mode 100644 index 0000000..6fe396f --- /dev/null +++ b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb @@ -0,0 +1,633 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab | Hypothesis Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Objective**\n", + "\n", + "Welcome to the Hypothesis Testing Lab, where we embark on an enlightening journey through the realm of statistical decision-making! In this laboratory, we delve into various scenarios, applying the powerful tools of hypothesis testing to scrutinize and interpret data.\n", + "\n", + "From testing the mean of a single sample (One Sample T-Test), to investigating differences between independent groups (Two Sample T-Test), and exploring relationships within dependent samples (Paired Sample T-Test), our exploration knows no bounds. Furthermore, we'll venture into the realm of Analysis of Variance (ANOVA), unraveling the complexities of comparing means across multiple groups.\n", + "\n", + "So, grab your statistical tools, prepare your hypotheses, and let's embark on this fascinating journey of exploration and discovery in the world of hypothesis testing!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 1**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with pokemon data. 
The data can be found here:\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#libraries\n", + "import pandas as pd\n", + "import scipy.stats as st\n", + "import numpy as np\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " Speed Generation Legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-statistic: 3.3349632905124063\n", + "p-value: 0.0007993609745420597\n" + ] + } + ], + "source": [ + "#Set the hypothesis\n", + "\n", + "#H0: mean Dragon HP = mean Grass HP\n", + "#H1: mean Dragon HP > mean Grass HP\n", + "\n", + "alpha = 0.05\n", + "\n", + "dragon_hp = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"]\n", + "grass_hp = df[df[\"Type 1\"] == \"Grass\"][\"HP\"]\n", + "\n", + "t_stat, p_value = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"p-value:\", p_value / 2)\n", + "\n", + "# The one-tailed p-value (about 0.0008; halving the two-sided value is valid because t > 0) is below alpha = 0.05, so we reject the null hypothesis and conclude that Dragon-type Pokémon have, on average, significantly higher HP than Grass-type Pokémon." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the proper test and, with 5% significance, comment your findings.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP: t=8.98, p=0.00000, Legendary mean=92.74, Non-Legendary mean=67.18\n", + "Attack: t=10.44, p=0.00000, Legendary mean=116.68, Non-Legendary mean=75.67\n", + "Defense: t=7.64, p=0.00000, Legendary mean=99.66, Non-Legendary mean=71.56\n", + "Sp. Atk: t=13.42, p=0.00000, Legendary mean=122.18, Non-Legendary mean=68.45\n", + "Sp. 
Def: t=10.02, p=0.00000, Legendary mean=105.94, Non-Legendary mean=68.89\n", + "Speed: t=11.48, p=0.00000, Legendary mean=100.18, Non-Legendary mean=65.46\n" + ] + } + ], + "source": [ + "# Hypotheses\n", + "# H0: For each stat, Legendary and Non-Legendary Pokémon have the same mean value.\n", + "# H1: For each stat, the Legendary and Non-Legendary means differ (two-sided test).\n", + "\n", + "stats_cols = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n", + "\n", + "# Run Welch's t-test for each stat\n", + "for col in stats_cols:\n", + " legendary = df[df[\"Legendary\"] == True][col]\n", + " nonlegendary = df[df[\"Legendary\"] == False][col]\n", + " \n", + " t_stat, p_value = st.ttest_ind(legendary, nonlegendary, equal_var=False)\n", + " print(f\"{col}: t={t_stat:.2f}, p={p_value:.5f}, \"\n", + " f\"Legendary mean={legendary.mean():.2f}, Non-Legendary mean={nonlegendary.mean():.2f}\")\n", + "\n", + "# Legendary Pokémon have a higher mean than Non-Legendary Pokémon in all six stats.\n", + "# Every p-value prints as 0.00000, far below alpha = 0.05 (and below the Bonferroni-adjusted 0.05 / 6), so we reject H0 for each of the six stats." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 34)\n", + "- Hospital coordinates (-122, 37)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothesis\n", + "# We want to test whether houses close to a school or hospital (distance < 0.5) are more expensive.\n", + "\n", + "# H0: Mean house value (median_house_value) of close houses = mean of far houses.\n", + "# H1: Mean house value of close houses > mean of far houses." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "school = (-118, 34)\n", + "hospital = (-122, 37)\n", + "\n", + "def euclidean_distance(lon, lat, ref):\n", + " return np.sqrt((lon - ref[0])**2 + (lat - ref[1])**2)\n", + "\n", + "# Distances\n", + "df[\"dist_school\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], school)\n", + "df[\"dist_hospital\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], hospital)\n", + "\n", + "# Close if near either school or hospital\n", + "df[\"close\"] = ((df[\"dist_school\"] < 0.50) | (df[\"dist_hospital\"] < 0.50))\n", + "\n", + "# Groups\n", + "close_values = df[df[\"close\"]][\"median_house_value\"]\n", + "far_values = df[~df[\"close\"]][\"median_house_value\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-stat: 37.992330214201516\n", + "p-value (one-tailed): 1.5032478884296307e-301\n", + "Mean close: 246951.98213501245\n", + "Mean far: 180678.44105790975\n" + ] + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(close_values, far_values, equal_var=False)\n", + "\n", + "# One-tailed (since hypothesis is directional: close > far)\n", + "if t_stat > 0:\n", + " p_value = p_value / 2\n", + "\n", + "print(\"t-stat:\", t_stat)\n", + "print(\"p-value (one-tailed):\", p_value)\n", + "print(\"Mean close:\", close_values.mean())\n", + "print(\"Mean far:\", far_values.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#As p < 0.05 we reject H0. Evidence that houses closer to schools/hospitals are significantly more expensive." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..6fe396f 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +297,34 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-statistic: 3.3349632905124063\n", + "p-value: 0.0007993609745420597\n" + ] + } + ], "source": [ - "#code here" + "#Set the hypothesis\n", + "\n", + "#H0: mean Dragon HP = mean Grass HP\n", + "#H1: mean Dragon HP > mean Grass HP\n", + "\n", + "alpha = 0.05\n", + "\n", + "dragon_hp = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"]\n", + "grass_hp = df[df[\"Type 1\"] == \"Grass\"][\"HP\"]\n", + "\n", + "t_stat, p_value = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"p-value:\", p_value / 2)\n", + "\n", + "# The one-tailed p-value (about 0.0008; halving the two-sided value is valid because t > 0) is below alpha = 0.05, so we reject the null hypothesis and conclude that Dragon-type Pokémon have, on average, significantly higher HP than Grass-type Pokémon." 
] }, { @@ -313,11 +336,40 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP: t=8.98, p=0.00000, Legendary mean=92.74, Non-Legendary mean=67.18\n", + "Attack: t=10.44, p=0.00000, Legendary mean=116.68, Non-Legendary mean=75.67\n", + "Defense: t=7.64, p=0.00000, Legendary mean=99.66, Non-Legendary mean=71.56\n", + "Sp. Atk: t=13.42, p=0.00000, Legendary mean=122.18, Non-Legendary mean=68.45\n", + "Sp. Def: t=10.02, p=0.00000, Legendary mean=105.94, Non-Legendary mean=68.89\n", + "Speed: t=11.48, p=0.00000, Legendary mean=100.18, Non-Legendary mean=65.46\n" + ] + } + ], "source": [ - "#code here" + "# Hypotheses\n", + "# H0: For each stat, Legendary and Non-Legendary Pokémon have the same mean value.\n", + "# H1: For each stat, the Legendary and Non-Legendary means differ (two-sided test).\n", + "\n", + "stats_cols = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n", + "\n", + "# Run Welch's t-test for each stat\n", + "for col in stats_cols:\n", + " legendary = df[df[\"Legendary\"] == True][col]\n", + " nonlegendary = df[df[\"Legendary\"] == False][col]\n", + " \n", + " t_stat, p_value = st.ttest_ind(legendary, nonlegendary, equal_var=False)\n", + " print(f\"{col}: t={t_stat:.2f}, p={p_value:.5f}, \"\n", + " f\"Legendary mean={legendary.mean():.2f}, Non-Legendary mean={nonlegendary.mean():.2f}\")\n", + "\n", + "# Legendary Pokémon have a higher mean than Non-Legendary Pokémon in all six stats.\n", + "# Every p-value prints as 0.00000, far below alpha = 0.05 (and below the Bonferroni-adjusted 0.05 / 6), so we reject H0 for each of the six stats." 
] }, { @@ -337,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -453,7 +505,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -486,21 +538,82 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Hypothesis\n", + "# We want to test whether houses close to a school or hospital (distance < 0.5) are more expensive.\n", + "\n", + "# H0: Mean house value (median_house_value) of close houses = mean of far houses.\n", + "# H1: Mean house value of close houses > mean of far houses." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "school = (-118, 34)\n", + "hospital = (-122, 37)\n", + "\n", + "def euclidean_distance(lon, lat, ref):\n", + " return np.sqrt((lon - ref[0])**2 + (lat - ref[1])**2)\n", + "\n", + "# Distances\n", + "df[\"dist_school\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], school)\n", + "df[\"dist_hospital\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], hospital)\n", + "\n", + "# Close if near either school or hospital\n", + "df[\"close\"] = ((df[\"dist_school\"] < 0.50) | (df[\"dist_hospital\"] < 0.50))\n", + "\n", + "# Groups\n", + "close_values = df[df[\"close\"]][\"median_house_value\"]\n", + "far_values = df[~df[\"close\"]][\"median_house_value\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-stat: 37.992330214201516\n", + "p-value (one-tailed): 1.5032478884296307e-301\n", + "Mean close: 246951.98213501245\n", + "Mean far: 180678.44105790975\n" + ] + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(close_values, far_values, equal_var=False)\n", + "\n", + "# One-tailed (since hypothesis is directional: close > far)\n", + "if t_stat > 0:\n", + " 
p_value = p_value / 2\n", + "\n", + "print(\"t-stat:\", t_stat)\n", + "print(\"p-value (one-tailed):\", p_value)\n", + "print(\"Mean close:\", close_values.mean())\n", + "print(\"Mean far:\", far_values.mean())" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#As p < 0.05 we reject H0. Evidence that houses closer to schools/hospitals are significantly more expensive." + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -512,9 +625,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.5" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }