labd #363
192 changes: 176 additions & 16 deletions lab-hypothesis-testing.ipynb
@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -278,7 +278,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -297,11 +297,47 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean(Dragon HP): 83.31 | Mean(Other HP): 68.67\n",
"Welch t-statistic: 3.400\n",
"One-sided p-value (Dragon > Others): 0.0008676\n",
"Conclusion: Reject H0: Dragons have higher average HP.\n"
]
}
],
"source": [
"#code here"
"# Hypothesis: Dragons have higher average HP than non-Dragons.\n",
"# H0: mu_Dragon = mu_Others\n",
"# H1: mu_Dragon > mu_Others (one-sided)\n",
"\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"\n",
"dragons = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"].dropna()\n",
"others = df[df[\"Type 1\"] != \"Dragon\"][\"HP\"].dropna()\n",
"\n",
"t_stat, p_two_sided = st.ttest_ind(dragons, others, equal_var=False, nan_policy=\"omit\")\n",
"\n",
"# Convert two-sided p to one-sided for 'greater' alternative:\n",
"# If mean(dragon) > mean(others), p_one_sided = p_two_sided / 2; else 1 - p_two_sided / 2\n",
"mean_diff = dragons.mean() - others.mean()\n",
"if mean_diff > 0:\n",
" p_one_sided = p_two_sided / 2\n",
"else:\n",
" p_one_sided = 1 - (p_two_sided / 2)\n",
"\n",
"alpha = 0.05\n",
"print(f\"Mean(Dragon HP): {dragons.mean():.2f} | Mean(Other HP): {others.mean():.2f}\")\n",
"print(f\"Welch t-statistic: {t_stat:.3f}\")\n",
"print(f\"One-sided p-value (Dragon > Others): {p_one_sided:.4g}\")\n",
"print(\"Conclusion:\", \"Reject H0: Dragons have higher average HP.\"\n",
" if p_one_sided < alpha else \"Fail to reject H0: Insufficient evidence that Dragons have higher HP.\")"
]
},
{
@@ -313,11 +349,56 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" stat mean_legendary mean_nonlegendary t_stat p_value significant_0.05\n",
" HP 92.738462 67.182313 8.981370 1.002691e-13 True\n",
" Attack 116.676923 75.669388 10.438134 2.520372e-16 True\n",
"Defense 99.661538 71.559184 7.637078 4.826998e-11 True\n",
"Sp. Atk 122.184615 68.454422 13.417450 1.551461e-21 True\n",
"Sp. Def 105.938462 68.892517 10.015697 2.294933e-15 True\n",
" Speed 100.184615 65.455782 11.475044 1.049016e-18 True\n"
]
}
],
"source": [
"#code here"
"# Hypothesis: Legendary Pokémon have different mean stats than non-Legendary.\n",
"# For each stat, test: H0: mu_Leg = mu_NonLeg vs H1: mu_Leg != mu_NonLeg (two-sided)\n",
"\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"import numpy as np\n",
"\n",
"\n",
"stats_cols = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n",
"\n",
"leg = df[df[\"Legendary\"] == True]\n",
"non = df[df[\"Legendary\"] == False]\n",
"\n",
"results = []\n",
"for col in stats_cols:\n",
" x = leg[col].dropna()\n",
" y = non[col].dropna()\n",
" t, p = st.ttest_ind(x, y, equal_var=False, nan_policy=\"omit\")\n",
" results.append({\n",
" \"stat\": col,\n",
" \"mean_legendary\": float(np.mean(x)),\n",
" \"mean_nonlegendary\": float(np.mean(y)),\n",
" \"t_stat\": float(t),\n",
" \"p_value\": float(p)\n",
" })\n",
"\n",
"# Present as a small table and mark significance at alpha=0.05 (uncorrected)\n",
"import pandas as pd\n",
"res_df = pd.DataFrame(results)\n",
"res_df[\"significant_0.05\"] = res_df[\"p_value\"] < 0.05\n",
"print(res_df.to_string(index=False))\n",
"\n"
]
},
{
@@ -337,7 +418,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -453,7 +534,7 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -483,10 +564,89 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"N (near) = 6829, N (far) = 10171\n",
"Mean price (near): $246,952\n",
"Mean price (far): $180,678\n",
"Welch t-statistic: 37.992\n",
"One-sided p-value (near > far): 1.503e-301\n",
"Effect size (Hedges' g): 0.595\n",
"Conclusion: Reject H0: Houses near a school or hospital are more expensive (at α=0.05).\n"
]
}
],
"source": [
"\n",
"# H0: mean price (near) == mean price (far)\n",
"# H1: mean price (near) > mean price (far) (one-sided Welch t-test, α = 0.05)\n",
"\n",
"\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"\n",
"# Coordinates given in the prompt\n",
"school_lon, school_lat = -118, 34\n",
"hospital_lon, hospital_lat = -122, 37\n",
"\n",
"# Safety check for expected columns\n",
"required_cols = {\"longitude\", \"latitude\", \"median_house_value\"}\n",
"missing = required_cols - set(df.columns)\n",
"if missing:\n",
" raise ValueError(f\"DataFrame is missing required columns: {missing}\")\n",
"\n",
"# Euclidean distance in degrees (as requested in the hint)\n",
"def euclid_distance(lon1, lat1, lon2, lat2):\n",
" return np.sqrt((lon1 - lon2)**2 + (lat1 - lat2)**2)\n",
"\n",
"# Distances to school and hospital\n",
"dist_school = euclid_distance(df[\"longitude\"], df[\"latitude\"], school_lon, school_lat)\n",
"dist_hosp = euclid_distance(df[\"longitude\"], df[\"latitude\"], hospital_lon, hospital_lat)\n",
"\n",
"# Near if closer than 0.50 degrees to either\n",
"near_threshold = 0.50\n",
"near_mask = (dist_school < near_threshold) | (dist_hosp < near_threshold)\n",
"\n",
"# Split groups\n",
"price_col = \"median_house_value\"\n",
"near_prices = df.loc[near_mask, price_col].dropna()\n",
"far_prices = df.loc[~near_mask, price_col].dropna()\n",
"\n",
"# Welch two-sided t-test, then convert to one-sided for alternative \"near > far\"\n",
"t_stat, p_two_sided = st.ttest_ind(near_prices, far_prices, equal_var=False, nan_policy=\"omit\")\n",
"mean_diff = near_prices.mean() - far_prices.mean()\n",
"p_one_sided = p_two_sided / 2 if mean_diff > 0 else 1 - (p_two_sided / 2)\n",
"\n",
"# Effect size (Hedges' g ~ unbiased Cohen's d with Welch S_p)\n",
"def hedges_g(x, y):\n",
" nx, ny = len(x), len(y)\n",
" sx2, sy2 = np.var(x, ddof=1), np.var(y, ddof=1)\n",
" sp = np.sqrt(((nx - 1)*sx2 + (ny - 1)*sy2) / (nx + ny - 2))\n",
" d = (np.mean(x) - np.mean(y)) / sp\n",
" # small-sample correction\n",
" J = 1 - (3 / (4*(nx + ny) - 9))\n",
" return J * d\n",
"\n",
"g = hedges_g(near_prices.values, far_prices.values)\n",
"\n",
"alpha = 0.05\n",
"print(f\"N (near) = {len(near_prices)}, N (far) = {len(far_prices)}\")\n",
"print(f\"Mean price (near): ${near_prices.mean():,.0f}\")\n",
"print(f\"Mean price (far): ${far_prices.mean():,.0f}\")\n",
"print(f\"Welch t-statistic: {t_stat:.3f}\")\n",
"print(f\"One-sided p-value (near > far): {p_one_sided:.4g}\")\n",
"print(f\"Effect size (Hedges' g): {g:.3f}\")\n",
"print(\"Conclusion:\", \"Reject H0: Houses near a school or hospital are more expensive (at α=0.05).\"\n",
" if p_one_sided < alpha else\n",
" \"Fail to reject H0: Insufficient evidence that near houses are more expensive at α=0.05.\")"
]
},
{
"cell_type": "code",
@@ -498,7 +658,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
@@ -512,7 +672,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.2"
}
},
"nbformat": 4,