Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 213 additions & 14 deletions lab-hypothesis-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -278,7 +278,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -297,11 +297,58 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import scipy.stats as st\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#code here"
"# Two-sample t-test: HP of Dragon-type vs. Grass-type rows\n",
"\n",
"def is_type(df, t):\n",
"    \"\"\"Return a boolean mask of the rows whose 'Type 1' or 'Type 2' equals `t`.\"\"\"\n",
"    primary_match = df['Type 1'].eq(t)\n",
"    secondary_match = df['Type 2'].eq(t)\n",
"    return primary_match | secondary_match\n",
"\n",
"dragon_rows = is_type(df, 'Dragon')\n",
"grass_rows = is_type(df, 'Grass')\n",
"\n",
"# HP samples as plain NumPy arrays, with missing values removed\n",
"dragons = df.loc[dragon_rows, 'HP'].dropna().to_numpy()\n",
"grass = df.loc[grass_rows, 'HP'].dropna().to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"t=4.499135, one-sided p=0.000007\n",
"Decision: Reject H0 (Dragon > Grass)\n"
]
}
],
"source": [
"# One-sided pooled-variance t-test on the two HP samples.\n",
"# H0: mean(dragons) <= mean(grass);  H1: mean(dragons) > mean(grass).\n",
"# alternative='greater' makes SciPy return the one-sided p-value directly,\n",
"# so no manual conversion from the two-sided p-value is needed.\n",
"tstat, p_one = st.ttest_ind(dragons, grass, equal_var=True, alternative='greater')\n",
"\n",
"alpha = 0.05\n",
"# '.6f' states the precision explicitly (a spec such as ':3f' only sets a minimum width).\n",
"print(f\"t={tstat:.6f}, one-sided p={p_one:.6f}\")\n",
"print(\"Decision:\", \"Reject H0 (Dragon > Grass)\" if p_one < alpha else \"Fail to reject H0\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reject 𝐻0: at the 5% significance level the data provide sufficient evidence that Dragon-type Pokémon have a higher mean HP than Grass-type Pokémon (t ≈ 4.50, one-sided p ≈ 0.000007)."
]
},
{
Expand All @@ -313,11 +360,125 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#code here"
"stats_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
"\n",
"# Partition the rows by the value of the 'Legendary' column\n",
"legendary_flag = df['Legendary']\n",
"leg = df[legendary_flag.eq(True)]\n",
"non = df[legendary_flag.eq(False)]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stat</th>\n",
" <th>t</th>\n",
" <th>p_two_sided</th>\n",
" <th>sig_5%</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Sp. Atk</td>\n",
" <td>14.191406</td>\n",
" <td>6.314916e-41</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Sp. Def</td>\n",
" <td>11.037751</td>\n",
" <td>1.843981e-26</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Attack</td>\n",
" <td>10.397321</td>\n",
" <td>7.827253e-24</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Speed</td>\n",
" <td>9.765234</td>\n",
" <td>2.354075e-21</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>HP</td>\n",
" <td>8.036124</td>\n",
" <td>3.330648e-15</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Defense</td>\n",
" <td>7.181240</td>\n",
" <td>1.584223e-12</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stat t p_two_sided sig_5%\n",
"3 Sp. Atk 14.191406 6.314916e-41 True\n",
"4 Sp. Def 11.037751 1.843981e-26 True\n",
"1 Attack 10.397321 7.827253e-24 True\n",
"5 Speed 9.765234 2.354075e-21 True\n",
"0 HP 8.036124 3.330648e-15 True\n",
"2 Defense 7.181240 1.584223e-12 True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def _pooled_ttest_row(col):\n",
"    \"\"\"Run a pooled two-sample t-test of `leg[col]` against `non[col]` and return one result record.\"\"\"\n",
"    legendary_vals = leg[col].dropna().to_numpy()\n",
"    other_vals = non[col].dropna().to_numpy()\n",
"    t_stat, p_val = st.ttest_ind(legendary_vals, other_vals, equal_var=True)\n",
"    return {\"stat\": col, \"t\": t_stat, \"p_two_sided\": p_val, \"sig_5%\": p_val < 0.05}\n",
"\n",
"rows = [_pooled_ttest_row(col) for col in stats_cols]\n",
"\n",
"# Smallest p-value first\n",
"pd.DataFrame(rows).sort_values(\"p_two_sided\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At the 5% level, Legendary Pokémon show significantly higher average values than Non-Legendary Pokémon in all six stats (Sp. Atk, Sp. Def, Attack, Speed, HP and Defense); every two-sided p-value is far below 0.05, so no stat shows a non-significant difference."
]
},
{
Expand All @@ -337,7 +498,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 14,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -453,7 +614,7 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -483,22 +644,60 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Reference points, stored as separate longitude / latitude values\n",
"school_lon = -118.0\n",
"school_lat = 34.0\n",
"hospital_lon = -122.0\n",
"hospital_lat = 37.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": []
"source": [
"def euclidean(lon, lat, ref_lon, ref_lat):\n",
"    \"\"\"Straight-line distance from (lon, lat) to (ref_lon, ref_lat), in the units of the inputs.\"\"\"\n",
"    d_lon = lon - ref_lon\n",
"    d_lat = lat - ref_lat\n",
"    return np.sqrt(d_lon**2 + d_lat**2)\n",
"\n",
"lon_col, lat_col = df['longitude'], df['latitude']\n",
"df['dist_school'] = euclidean(lon_col, lat_col, school_lon, school_lat)\n",
"df['dist_hospital'] = euclidean(lon_col, lat_col, hospital_lon, hospital_lat)\n",
"\n",
"# A row counts as 'near' when either distance is below 0.50\n",
"close_to_school = df['dist_school'].lt(0.50)\n",
"close_to_hospital = df['dist_hospital'].lt(0.50)\n",
"df['near_either'] = close_to_school | close_to_hospital"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Near n=6829, Far n=10171\n",
"Means: Near=246952, Far=180678\n",
"t=38.046, two-sided p=4.818e-304\n",
"Decision: Significant difference\n"
]
}
],
"source": [
"# Split the median house values by the proximity flag, dropping missing values\n",
"is_near = df['near_either']\n",
"house_values = df['median_house_value']\n",
"near_vals = house_values[is_near].dropna().to_numpy()\n",
"far_vals = house_values[~is_near].dropna().to_numpy()\n",
"\n",
"alpha = 0.05\n",
"t, p = st.ttest_ind(near_vals, far_vals, equal_var=True)  # pooled variance, as in the lesson\n",
"\n",
"print(f\"Near n={len(near_vals)}, Far n={len(far_vals)}\")\n",
"print(f\"Means: Near={near_vals.mean():.0f}, Far={far_vals.mean():.0f}\")\n",
"print(f\"t={t:.3f}, two-sided p={p:.4g}\")\n",
"decision = \"Significant difference\" if p < alpha else \"No significant difference\"\n",
"print(\"Decision:\", decision)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -512,7 +711,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down