diff --git a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb new file mode 100644 index 0000000..9814711 --- /dev/null +++ b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb @@ -0,0 +1,1222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab | Hypothesis Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Objective**\n", + "\n", + "Welcome to the Hypothesis Testing Lab, where we embark on an enlightening journey through the realm of statistical decision-making! In this laboratory, we delve into various scenarios, applying the powerful tools of hypothesis testing to scrutinize and interpret data.\n", + "\n", + "From testing the mean of a single sample (One Sample T-Test), to investigating differences between independent groups (Two Sample T-Test), and exploring relationships within dependent samples (Paired Sample T-Test), our exploration knows no bounds. Furthermore, we'll venture into the realm of Analysis of Variance (ANOVA), unraveling the complexities of comparing means across multiple groups.\n", + "\n", + "So, grab your statistical tools, prepare your hypotheses, and let's embark on this fascinating journey of exploration and discovery in the world of hypothesis testing!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 1**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with pokemon data. 
The data can be found here:\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#libraries\n", + "import pandas as pd\n", + "import scipy.stats as st\n", + "import numpy as np\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " Speed Generation Legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H0: dragon_mean <= grass_mean\n", + "H1: dragon_mean > grass_mean\n", + "\n", + "H0: dragon_mean - grass_mean <= 0\n", + "H1: dragon_mean - grass_mean > 0" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "hp_dragon = df[df[\"Type 1\"]==\"Dragon\"][\"HP\"]\n", + "hp_grass = df[df[\"Type 1\"]==\"Grass\"][\"HP\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "hp_dragon_std = hp_dragon.std(ddof=1)\n", + "hp_grass_std = hp_grass.std(ddof=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(np.float64(83.3125), np.float64(67.27142857142857))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hp_dragon.mean(), hp_grass.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "HP 7.174327\n", + "Attack 11.329291\n", + "Defense 7.795239\n", + "Sp. Atk 12.377972\n", + "Sp. Def 9.182699\n", + "Speed 8.995969\n", + "dtype: float64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hp_dragon_mean = hp_dragon.mean()\n", + "hp_grass_mean = hp_grass.mean()\n", + "\n", + "hp_dragon_std = hp_dragon.std(ddof=1)\n", + "hp_grass_std = hp_grass.std(ddof=1)\n", + "\n", + "n1 = len(hp_dragon)\n", + "n2 = len(hp_grass)\n", + "\n", + "statistic = (hp_dragon_mean - hp_grass_mean) / np.sqrt((hp_dragon_std**2/n1) + (hp_grass_std**2/n2)) # Welch: each variance over its own sample size\n", + "statistic, st.ttest_ind(hp_dragon, hp_grass, equal_var=False, alternative=\"greater\") # one-sided p-value for H1: dragon > grass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. 
Choose the propper test and, with 5% significance, comment your findings.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HPAttackDefenseSp. AtkSp. DefSpeed
0454949656545
1606263808060
280828310010080
38010012312212080
4395243605065
.....................
7955010015010015050
79650160110160110110
797801106015013070
798801606017013080
799801101201309070
\n", + "

800 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " HP Attack Defense Sp. Atk Sp. Def Speed\n", + "0 45 49 49 65 65 45\n", + "1 60 62 63 80 80 60\n", + "2 80 82 83 100 100 80\n", + "3 80 100 123 122 120 80\n", + "4 39 52 43 60 50 65\n", + ".. .. ... ... ... ... ...\n", + "795 50 100 150 100 150 50\n", + "796 50 160 110 160 110 110\n", + "797 80 110 60 150 130 70\n", + "798 80 160 60 170 130 80\n", + "799 80 110 120 130 90 70\n", + "\n", + "[800 rows x 6 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats = \"HP\", \"Attack\",\t\"Defense\",\t\"Sp. Atk\",\t\"Sp. Def\",\t\"Speed\"\n", + "df_stats = df[list(stats)]\n", + "df_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HPAttackDefenseSp. AtkSp. DefSpeed
0454949656545
1606263808060
280828310010080
38010012312212080
4395243605065
.....................
78785100122587554
788556985323528
78995117184444628
790403035454055
7918570809780123
\n", + "

735 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " HP Attack Defense Sp. Atk Sp. Def Speed\n", + "0 45 49 49 65 65 45\n", + "1 60 62 63 80 80 60\n", + "2 80 82 83 100 100 80\n", + "3 80 100 123 122 120 80\n", + "4 39 52 43 60 50 65\n", + ".. .. ... ... ... ... ...\n", + "787 85 100 122 58 75 54\n", + "788 55 69 85 32 35 28\n", + "789 95 117 184 44 46 28\n", + "790 40 30 35 45 40 55\n", + "791 85 70 80 97 80 123\n", + "\n", + "[735 rows x 6 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "leg = df_stats[df[\"Legendary\"]== True]\n", + "leg\n", + "non_leg = df_stats[df[\"Legendary\"] == False]\n", + "non_leg" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-stat: 11.47504444631443\n", + "p-value: 1.049016311882451e-18\n" + ] + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(leg, non_leg, equal_var=False) # Welch\n", + "print(\"t-stat:\", t_stat)\n", + "print(\"p-value:\", p_value)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP: t=8.981, p=0.0000\n", + "Attack: t=10.438, p=0.0000\n", + "Defense: t=7.637, p=0.0000\n", + "Sp. Atk: t=13.417, p=0.0000\n", + "Sp. Def: t=10.016, p=0.0000\n", + "Speed: t=11.475, p=0.0000\n" + ] + } + ], + "source": [ + "for col in stats:\n", + " leg = df[df[\"Legendary\"] == True][col]\n", + " non_leg = df[df[\"Legendary\"] == False][col]\n", + " \n", + " t_stat, p_value = st.ttest_ind(leg, non_leg, equal_var=False)\n", + " print(f\"{col}: t={t_stat:.3f}, p={p_value:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. 
The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 34)\n", + "- Hospital coordinates (-122, 37)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 4.034476\n", + "1 3.890617\n", + "2 3.792690\n", + "3 3.787149\n", + "4 3.794443\n", + " ... 
\n", + "16995 8.851147\n", + "16996 8.939855\n", + "16997 9.848208\n", + "16998 9.816394\n", + "16999 8.882100\n", + "Length: 17000, dtype: float64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "\n", + "lon0, lat0 = -118.0, 34.0 # school coordinates as given in the task statement\n", + "\n", + "df_distance_school = np.sqrt((df['longitude'] - lon0)**2 + (df['latitude'] - lat0)**2)\n", + "df_distance_school" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 8.535789\n", + "1 8.316850\n", + "2 8.482464\n", + "3 8.492915\n", + "4 8.520851\n", + " ... \n", + "16995 4.048271\n", + "16996 4.150434\n", + "16997 5.210614\n", + "16998 5.173480\n", + "16999 4.056107\n", + "Length: 17000, dtype: float64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lon1, lat1 = -122.0, 37.0 # hospital coordinates as given in the task statement\n", + "\n", + "df_distance_hospital = np.sqrt((df['longitude'] - lon1)**2 + (df['latitude'] - lat1)**2)\n", + "df_distance_hospital" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... \n", + "16995 False\n", + "16996 False\n", + "16997 False\n", + "16998 False\n", + "16999 False\n", + "Length: 17000, dtype: bool" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "price_close_school = df[df_distance_school <= 0.5]['median_house_value']\n", + "price_close_school" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 66900.0\n", + "1 80100.0\n", + "2 85700.0\n", + "3 73400.0\n", + "4 65500.0\n", + " ... 
\n", + "16995 111400.0\n", + "16996 79000.0\n", + "16997 103600.0\n", + "16998 85800.0\n", + "16999 94600.0\n", + "Name: median_house_value, Length: 11743, dtype: float64" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_school_far = df[df_distance_school > 0.5]['median_house_value']\n", + "df_school_far" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 66900.0\n", + "1 80100.0\n", + "2 85700.0\n", + "3 73400.0\n", + "4 65500.0\n", + " ... \n", + "16995 111400.0\n", + "16996 79000.0\n", + "16997 103600.0\n", + "16998 85800.0\n", + "16999 94600.0\n", + "Name: median_house_value, Length: 16545, dtype: float64" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hospital_close = df[df_distance_hospital < 0.5]['median_house_value']\n", + "df_hospital_close" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13131 289500.0\n", + "13132 321100.0\n", + "13133 308800.0\n", + "13229 293300.0\n", + "13230 356600.0\n", + " ... 
\n", + "15384 500001.0\n", + "15587 500001.0\n", + "15685 456300.0\n", + "15686 286100.0\n", + "15971 354300.0\n", + "Name: median_house_value, Length: 455, dtype: float64" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hospital_far = df[df_distance_hospital >= 0.5]['median_house_value']\n", + "df_hospital_far" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(-181.21516936688832)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_mean_close = price_close_school.mean()\n", + "s_mean_far = df_school_far.mean()\n", + "\n", + "s_std_close = price_close_school.std(ddof=1)\n", + "s_std_far = df_school_far.std(ddof=1)\n", + "\n", + "n_close = len(price_close_school)\n", + "n_far = len(df_school_far)\n", + "\n", + "t_stat = (s_mean_close - s_mean_far) / np.sqrt((s_std_close**2 / n_close) + (s_std_far**2 / n_far))\n", + "t_stat, st.ttest_ind(price_close_school, df_school_far, equal_var=False, alternative=\"greater\") # one-sided p-value for H1: close > far\n" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(-91.69177535787794)" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "h_mean_close = df_hospital_close.mean()\n", + "h_mean_far = df_hospital_far.mean()\n", + "\n", + "h_std_close = df_hospital_close.std(ddof=1)\n", + "h_std_far = df_hospital_far.std(ddof=1)\n", + "\n", + "n_close = len(df_hospital_close)\n", + "n_far = len(df_hospital_far)\n", + "\n", + "t_stat = (h_mean_close - h_mean_far) / np.sqrt((h_std_close**2 / n_close) + (h_std_far**2 / n_far))\n", + "t_stat, st.ttest_ind(df_hospital_close, df_hospital_far, equal_var=False, alternative=\"greater\") # one-sided p-value for H1: close > far" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/anaconda_projects/db/project_filebrowser.db b/anaconda_projects/db/project_filebrowser.db new file mode 100644 index 0000000..6fab372 Binary files /dev/null and b/anaconda_projects/db/project_filebrowser.db differ diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..9814711 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -295,13 +295,91 @@ "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "H0: dragon_mean <= grass_mean\n", + "H1: dragon_mean > grass_mean\n", + "\n", + "H0: dragon_mean - grass_mean <= 0\n", + "H1: dragon_mean - grass_mean > 0" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "hp_dragon = df[df[\"Type 1\"]==\"Dragon\"][\"HP\"]\n", + "hp_grass = df[df[\"Type 1\"]==\"Grass\"][\"HP\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "#code here" + "hp_dragon_std = hp_dragon.std(ddof=1)\n", + "hp_grass_std = hp_grass.std(ddof=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(np.float64(83.3125), np.float64(67.27142857142857))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hp_dragon.mean(), hp_grass.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "HP 7.174327\n", + "Attack 11.329291\n", + "Defense 7.795239\n", + "Sp. Atk 12.377972\n", + "Sp. Def 9.182699\n", + "Speed 8.995969\n", + "dtype: float64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hp_dragon_mean = hp_dragon.mean()\n", + "hp_grass_mean = hp_grass.mean()\n", + "\n", + "hp_dragon_std = hp_dragon.std(ddof=1)\n", + "hp_grass_std = hp_grass.std(ddof=1)\n", + "\n", + "n1 = len(hp_dragon)\n", + "n2 = len(hp_grass)\n", + "\n", + "statistic = (hp_dragon_mean - hp_grass_mean) / np.sqrt((hp_dragon_std**2/n1) + (hp_grass_std**2/n2)) # Welch: each variance over its own sample size\n", + "statistic, st.ttest_ind(hp_dragon, hp_grass, equal_var=False, alternative=\"greater\") # one-sided p-value for H1: dragon > grass" ] }, { @@ -315,9 +393,382 @@ "cell_type": "code", "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HPAttackDefenseSp. AtkSp. DefSpeed
0454949656545
1606263808060
280828310010080
38010012312212080
4395243605065
.....................
7955010015010015050
79650160110160110110
797801106015013070
798801606017013080
799801101201309070
\n", + "

800 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " HP Attack Defense Sp. Atk Sp. Def Speed\n", + "0 45 49 49 65 65 45\n", + "1 60 62 63 80 80 60\n", + "2 80 82 83 100 100 80\n", + "3 80 100 123 122 120 80\n", + "4 39 52 43 60 50 65\n", + ".. .. ... ... ... ... ...\n", + "795 50 100 150 100 150 50\n", + "796 50 160 110 160 110 110\n", + "797 80 110 60 150 130 70\n", + "798 80 160 60 170 130 80\n", + "799 80 110 120 130 90 70\n", + "\n", + "[800 rows x 6 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats = \"HP\", \"Attack\",\t\"Defense\",\t\"Sp. Atk\",\t\"Sp. Def\",\t\"Speed\"\n", + "df_stats = df[list(stats)]\n", + "df_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HPAttackDefenseSp. AtkSp. DefSpeed
0454949656545
1606263808060
280828310010080
38010012312212080
4395243605065
.....................
78785100122587554
788556985323528
78995117184444628
790403035454055
7918570809780123
\n", + "

735 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " HP Attack Defense Sp. Atk Sp. Def Speed\n", + "0 45 49 49 65 65 45\n", + "1 60 62 63 80 80 60\n", + "2 80 82 83 100 100 80\n", + "3 80 100 123 122 120 80\n", + "4 39 52 43 60 50 65\n", + ".. .. ... ... ... ... ...\n", + "787 85 100 122 58 75 54\n", + "788 55 69 85 32 35 28\n", + "789 95 117 184 44 46 28\n", + "790 40 30 35 45 40 55\n", + "791 85 70 80 97 80 123\n", + "\n", + "[735 rows x 6 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#code here" + "leg = df_stats[df[\"Legendary\"]== True]\n", + "leg\n", + "non_leg = df_stats[df[\"Legendary\"] == False]\n", + "non_leg" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-stat: 11.47504444631443\n", + "p-value: 1.049016311882451e-18\n" + ] + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(leg, non_leg, equal_var=False) # Welch\n", + "print(\"t-stat:\", t_stat)\n", + "print(\"p-value:\", p_value)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP: t=8.981, p=0.0000\n", + "Attack: t=10.438, p=0.0000\n", + "Defense: t=7.637, p=0.0000\n", + "Sp. Atk: t=13.417, p=0.0000\n", + "Sp. 
Def: t=10.016, p=0.0000\n", + "Speed: t=11.475, p=0.0000\n" + ] + } + ], + "source": [ + "for col in stats:\n", + " leg = df[df[\"Legendary\"] == True][col]\n", + " non_leg = df[df[\"Legendary\"] == False][col]\n", + " \n", + " t_stat, p_value = st.ttest_ind(leg, non_leg, equal_var=False)\n", + " print(f\"{col}: t={t_stat:.3f}, p={p_value:.4f}\")" ] }, { @@ -337,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -453,7 +904,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -483,10 +934,261 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "0 4.034476\n", + "1 3.890617\n", + "2 3.792690\n", + "3 3.787149\n", + "4 3.794443\n", + " ... \n", + "16995 8.851147\n", + "16996 8.939855\n", + "16997 9.848208\n", + "16998 9.816394\n", + "16999 8.882100\n", + "Length: 17000, dtype: float64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "\n", + "lon0, lat0 = -118.0, 34.0 # school coordinates as given in the task statement\n", + "\n", + "df_distance_school = np.sqrt((df['longitude'] - lon0)**2 + (df['latitude'] - lat0)**2)\n", + "df_distance_school" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 8.535789\n", + "1 8.316850\n", + "2 8.482464\n", + "3 8.492915\n", + "4 8.520851\n", + " ... 
\n", + "16995 4.048271\n", + "16996 4.150434\n", + "16997 5.210614\n", + "16998 5.173480\n", + "16999 4.056107\n", + "Length: 17000, dtype: float64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lon1, lat1 = -122.0, 37.0 # hospital coordinates as given in the task statement\n", + "\n", + "df_distance_hospital = np.sqrt((df['longitude'] - lon1)**2 + (df['latitude'] - lat1)**2)\n", + "df_distance_hospital" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... \n", + "16995 False\n", + "16996 False\n", + "16997 False\n", + "16998 False\n", + "16999 False\n", + "Length: 17000, dtype: bool" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "price_close_school = df[df_distance_school <= 0.5]['median_house_value']\n", + "price_close_school" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 66900.0\n", + "1 80100.0\n", + "2 85700.0\n", + "3 73400.0\n", + "4 65500.0\n", + " ... \n", + "16995 111400.0\n", + "16996 79000.0\n", + "16997 103600.0\n", + "16998 85800.0\n", + "16999 94600.0\n", + "Name: median_house_value, Length: 11743, dtype: float64" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_school_far = df[df_distance_school > 0.5]['median_house_value']\n", + "df_school_far" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 66900.0\n", + "1 80100.0\n", + "2 85700.0\n", + "3 73400.0\n", + "4 65500.0\n", + " ... 
\n", + "16995 111400.0\n", + "16996 79000.0\n", + "16997 103600.0\n", + "16998 85800.0\n", + "16999 94600.0\n", + "Name: median_house_value, Length: 16545, dtype: float64" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hospital_close = df[df_distance_hospital < 0.5]['median_house_value']\n", + "df_hospital_close" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13131 289500.0\n", + "13132 321100.0\n", + "13133 308800.0\n", + "13229 293300.0\n", + "13230 356600.0\n", + " ... \n", + "15384 500001.0\n", + "15587 500001.0\n", + "15685 456300.0\n", + "15686 286100.0\n", + "15971 354300.0\n", + "Name: median_house_value, Length: 455, dtype: float64" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hospital_far = df[df_distance_hospital >= 0.5]['median_house_value']\n", + "df_hospital_far" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(-181.21516936688832)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_mean_close = price_close_school.mean()\n", + "s_mean_far = df_school_far.mean()\n", + "\n", + "s_std_close = price_close_school.std(ddof=1)\n", + "s_std_far = df_school_far.std(ddof=1)\n", + "\n", + "n_close = len(price_close_school)\n", + "n_far = len(df_school_far)\n", + "\n", + "t_stat = (s_mean_close - s_mean_far) / np.sqrt((s_std_close**2 / n_close) + (s_std_far**2 / n_far))\n", + "t_stat, st.ttest_ind(price_close_school, df_school_far, equal_var=False, alternative=\"greater\") # one-sided p-value for H1: close > far\n" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(-91.69177535787794)" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "h_mean_close = df_hospital_close.mean()\n", 
+ "h_mean_far = df_hospital_far.mean()\n", + "\n", + "h_std_close = df_hospital_close.std(ddof=1)\n", + "h_std_far = df_hospital_far.std(ddof=1)\n", + "\n", + "n_close = len(df_hospital_close)\n", + "n_far = len(df_hospital_far)\n", + "\n", + "t_stat = (h_mean_close - h_mean_far) / np.sqrt((h_std_close**2 / n_close) + (h_std_far**2 / n_far))\n", + "t_stat, st.ttest_ind(df_hospital_close, df_hospital_far, equal_var=False, alternative=\"greater\") # one-sided p-value for H1: close > far" + ] }, { "cell_type": "code", @@ -498,9 +1200,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "python3" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -512,9 +1214,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.5" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }