Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 249 additions & 29 deletions lab-hypothesis-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -278,7 +278,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -297,11 +297,36 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"np.float64(0.0007993609745420597)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#code here"
"\n",
"# Hypotheses (significance level alpha = 0.05):\n",
"#   H0: mean HP of Dragon-type Pokemon == mean HP of Grass-type Pokemon\n",
"#   H1: mean HP of Dragon-type Pokemon >  mean HP of Grass-type Pokemon\n",
"\n",
"# HP_Pokemons holds the HP values of the Dragon-type group.\n",
"HP_Pokemons = df.loc[df['Type 1'] == 'Dragon', 'HP']\n",
"HP_Grass = df.loc[df['Type 1'] == 'Grass', 'HP']\n",
"\n",
"# One-sided two-sample t-test on independent samples; equal_var=False selects Welch's variant.\n",
"p_value = st.ttest_ind(\n",
"    HP_Pokemons,\n",
"    HP_Grass,\n",
"    equal_var=False,\n",
"    alternative='greater',\n",
").pvalue\n",
"p_value\n",
"\n",
"# Conclusion: p_value < alpha, so reject H0 -- Dragon-type Pokemon have a higher average HP than Grass-type Pokemon.\n",
"\n"
]
},
{
Expand All @@ -313,11 +338,48 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Multivariate linear model\n",
"================================================================\n",
" \n",
"----------------------------------------------------------------\n",
" Intercept Value Num DF Den DF F Value Pr > F\n",
"----------------------------------------------------------------\n",
" Wilks' lambda 0.0592 6.0000 793.0000 2100.8338 0.0000\n",
" Pillai's trace 0.9408 6.0000 793.0000 2100.8338 0.0000\n",
" Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
" Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
"----------------------------------------------------------------\n",
" \n",
"----------------------------------------------------------------\n",
" Legendary Value Num DF Den DF F Value Pr > F\n",
"----------------------------------------------------------------\n",
" Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n",
" Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n",
" Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n",
" Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n",
"================================================================\n",
"\n"
]
}
],
"source": [
"#code here"
"\n",
"from statsmodels.multivariate.manova import MANOVA\n",
"from patsy import dmatrix\n",
"\n",
"# H0: Legendary and Non-Legendary Pokémon share the same multivariate mean vector of stats\n",
"#     (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed).\n",
"# H1: the multivariate mean vectors differ, i.e. the mean of at least one stat is different between the two groups.\n",
"\n",
"# Q(\"...\") quotes column names that contain spaces or dots so the formula parser accepts them.\n",
"maov = MANOVA.from_formula(\n",
"    'HP + Attack + Defense + Q(\"Sp. Atk\") + Q(\"Sp. Def\") + Speed ~ Legendary',\n",
"    data=df,\n",
")\n",
"print(maov.mv_test())\n",
"\n",
"# Conclusion: Pr > F is reported as 0.0000 for every Legendary test statistic, so we reject H0 -- there is a statistically significant difference in the combined stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) between Legendary and Non-Legendary Pokémon."
]
},
{
Expand All @@ -337,7 +399,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 47,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -433,34 +495,121 @@
" <td>1.9250</td>\n",
" <td>65500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16995</th>\n",
" <td>-124.26</td>\n",
" <td>40.58</td>\n",
" <td>52.0</td>\n",
" <td>2217.0</td>\n",
" <td>394.0</td>\n",
" <td>907.0</td>\n",
" <td>369.0</td>\n",
" <td>2.3571</td>\n",
" <td>111400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16996</th>\n",
" <td>-124.27</td>\n",
" <td>40.69</td>\n",
" <td>36.0</td>\n",
" <td>2349.0</td>\n",
" <td>528.0</td>\n",
" <td>1194.0</td>\n",
" <td>465.0</td>\n",
" <td>2.5179</td>\n",
" <td>79000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16997</th>\n",
" <td>-124.30</td>\n",
" <td>41.84</td>\n",
" <td>17.0</td>\n",
" <td>2677.0</td>\n",
" <td>531.0</td>\n",
" <td>1244.0</td>\n",
" <td>456.0</td>\n",
" <td>3.0313</td>\n",
" <td>103600.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16998</th>\n",
" <td>-124.30</td>\n",
" <td>41.80</td>\n",
" <td>19.0</td>\n",
" <td>2672.0</td>\n",
" <td>552.0</td>\n",
" <td>1298.0</td>\n",
" <td>478.0</td>\n",
" <td>1.9797</td>\n",
" <td>85800.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16999</th>\n",
" <td>-124.35</td>\n",
" <td>40.54</td>\n",
" <td>52.0</td>\n",
" <td>1820.0</td>\n",
" <td>300.0</td>\n",
" <td>806.0</td>\n",
" <td>270.0</td>\n",
" <td>3.0147</td>\n",
" <td>94600.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17000 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"0 -114.31 34.19 15.0 5612.0 1283.0 \n",
"1 -114.47 34.40 19.0 7650.0 1901.0 \n",
"2 -114.56 33.69 17.0 720.0 174.0 \n",
"3 -114.57 33.64 14.0 1501.0 337.0 \n",
"4 -114.57 33.57 20.0 1454.0 326.0 \n",
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"0 -114.31 34.19 15.0 5612.0 1283.0 \n",
"1 -114.47 34.40 19.0 7650.0 1901.0 \n",
"2 -114.56 33.69 17.0 720.0 174.0 \n",
"3 -114.57 33.64 14.0 1501.0 337.0 \n",
"4 -114.57 33.57 20.0 1454.0 326.0 \n",
"... ... ... ... ... ... \n",
"16995 -124.26 40.58 52.0 2217.0 394.0 \n",
"16996 -124.27 40.69 36.0 2349.0 528.0 \n",
"16997 -124.30 41.84 17.0 2677.0 531.0 \n",
"16998 -124.30 41.80 19.0 2672.0 552.0 \n",
"16999 -124.35 40.54 52.0 1820.0 300.0 \n",
"\n",
" population households median_income median_house_value \n",
"0 1015.0 472.0 1.4936 66900.0 \n",
"1 1129.0 463.0 1.8200 80100.0 \n",
"2 333.0 117.0 1.6509 85700.0 \n",
"3 515.0 226.0 3.1917 73400.0 \n",
"4 624.0 262.0 1.9250 65500.0 "
" population households median_income median_house_value \n",
"0 1015.0 472.0 1.4936 66900.0 \n",
"1 1129.0 463.0 1.8200 80100.0 \n",
"2 333.0 117.0 1.6509 85700.0 \n",
"3 515.0 226.0 3.1917 73400.0 \n",
"4 624.0 262.0 1.9250 65500.0 \n",
"... ... ... ... ... \n",
"16995 907.0 369.0 2.3571 111400.0 \n",
"16996 1194.0 465.0 2.5179 79000.0 \n",
"16997 1244.0 456.0 3.0313 103600.0 \n",
"16998 1298.0 478.0 1.9797 85800.0 \n",
"16999 806.0 270.0 3.0147 94600.0 \n",
"\n",
"[17000 rows x 9 columns]"
]
},
"execution_count": 5,
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
"df.head()"
"df"
]
},
{
Expand All @@ -483,22 +632,93 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import math\n",
"\n",
"# Reference points, each stored as a (longitude, latitude) pair.\n",
"school_location: tuple[int, int] = (-118, 34)\n",
"hospital_location: tuple[int, int] = (-122, 37)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# Euclidean distance from a house (neighborhood) to a reference point such as the school or the hospital.\n",
"\n",
"def calculate_distance(row, location):\n",
"    \"\"\"Return the Euclidean distance between a house and a reference point.\n",
"\n",
"    row: record exposing 'longitude' and 'latitude' values.\n",
"    location: pair whose first item is paired with the longitude and whose\n",
"        second item is paired with the latitude.\n",
"    \"\"\"\n",
"    delta_longitude = row['longitude'] - location[0]\n",
"    delta_latitude = row['latitude'] - location[1]\n",
"    return math.sqrt(delta_longitude**2 + delta_latitude**2)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Row-wise distance from every house to each reference point, stored as new columns.\n",
"df['dis_from_school'] = df.apply(lambda house: calculate_distance(house, school_location), axis=1)\n",
"df['dis_from_hospital'] = df.apply(lambda house: calculate_distance(house, hospital_location), axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# A house (neighborhood) counts as 'close' when its distance to the school OR to the hospital is lower than 0.50;\n",
"# otherwise it is labelled 'far'.\n",
"# Both distance columns must be checked: testing only dis_from_school would label houses near the hospital as 'far'.\n",
"# NOTE: results computed from this column (group split, ANOVA) must be re-run after changing this condition.\n",
"\n",
"df['close_or_far'] = df.apply(lambda row: 'close' if (row['dis_from_school'] < 0.5) or (row['dis_from_hospital'] < 0.5) else 'far',axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Split the median house values into two samples: houses close to either a hospital or school, and houses far from both.\n",
"\n",
"s_close = df.loc[df['close_or_far'] == 'close', 'median_house_value']\n",
"s_far = df.loc[df['close_or_far'] == 'far', 'median_house_value']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"F_onewayResult(statistic=np.float64(577.462645674138), pvalue=np.float64(1.6450819839186202e-125))"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# H0: houses are priced the same whether they are close to or far from a school or hospital\n",
"# H1: houses close to either a school or a hospital are more expensive\n",
"\n",
"\n",
"# anova test \n",
"st.f_oneway(s_close,s_far)\n",
"\n",
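"# p-value = 1.6450819839186202e-125 < 0.05, so reject H0: mean house prices differ significantly between the close and far groups. One-way ANOVA is non-directional, so compare the two group means to confirm that houses close to either a school or a hospital are the more expensive ones."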
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "py310",
"language": "python",
"name": "python3"
},
Expand All @@ -512,7 +732,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.10.18"
}
},
"nbformat": 4,
Expand Down