From abaf8d43a1ab041eee2ec042c94aa2d1106904b8 Mon Sep 17 00:00:00 2001
From: NodrrS
Date: Sun, 7 Sep 2025 18:50:12 +0200
Subject: [PATCH] labd

---
 lab-hypothesis-testing.ipynb | 182 ++++++++++++++++++++++++++++++++---
 1 file changed, 166 insertions(+), 16 deletions(-)

diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
index 0cc26d5..b234a40 100644
--- a/lab-hypothesis-testing.ipynb
+++ b/lab-hypothesis-testing.ipynb
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -278,7 +278,7 @@
        "[800 rows x 11 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -297,11 +297,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 18,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mean(Dragon HP): 83.31 | Mean(Other HP): 68.67\n",
+      "Welch t-statistic: 3.400\n",
+      "One-sided p-value (Dragon > Others): 0.0008676\n",
+      "Conclusion: Reject H0: Dragons have higher average HP.\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "# Hypothesis: Dragons have higher average HP than non-Dragons.\n",
+    "# H0: mu_Dragon = mu_Others\n",
+    "# H1: mu_Dragon > mu_Others (one-sided)\n",
+    "\n",
+    "import pandas as pd\n",
+    "import scipy.stats as st\n",
+    "\n",
+    "dragons = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"].dropna()\n",
+    "others = df[df[\"Type 1\"] != \"Dragon\"][\"HP\"].dropna()\n",
+    "\n",
+    "# Welch's t-test (unequal variances). alternative=\"greater\" (SciPy >= 1.6) returns the\n",
+    "# one-sided p-value for H1 directly, so no manual halving of a two-sided p-value is needed.\n",
+    "# NaNs are already removed by dropna() above, so nan_policy is not required here.\n",
+    "t_stat, p_one_sided = st.ttest_ind(dragons, others, equal_var=False, alternative=\"greater\")\n",
+    "\n",
+    "alpha = 0.05\n",
+    "print(f\"Mean(Dragon HP): {dragons.mean():.2f} | Mean(Other HP): {others.mean():.2f}\")\n",
+    "print(f\"Welch t-statistic: {t_stat:.3f}\")\n",
+    "print(f\"One-sided p-value (Dragon > Others): {p_one_sided:.4g}\")\n",
+    "print(\"Conclusion:\", \"Reject H0: Dragons have higher average HP.\"\n",
+    "      if p_one_sided < alpha else \"Fail to reject H0: Insufficient evidence that Dragons have higher HP.\")"
    ]
   },
   {
@@ -313,11 +344,55 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   stat mean_legendary mean_nonlegendary    t_stat      p_value significant_0.05\n",
+      "     HP      92.738462         67.182313  8.981370 1.002691e-13             True\n",
+      " Attack     116.676923         75.669388 10.438134 2.520372e-16             True\n",
+      "Defense      99.661538         71.559184  7.637078 4.826998e-11             True\n",
+      "Sp. Atk     122.184615         68.454422 13.417450 1.551461e-21             True\n",
+      "Sp. Def     105.938462         68.892517 10.015697 2.294933e-15             True\n",
+      "  Speed     100.184615         65.455782 11.475044 1.049016e-18             True\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "# Hypothesis: Legendary Pokémon have different mean stats than non-Legendary.\n",
+    "# For each stat, test: H0: mu_Leg = mu_NonLeg vs H1: mu_Leg != mu_NonLeg (two-sided)\n",
+    "\n",
+    "import pandas as pd\n",
+    "import scipy.stats as st\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "stats_cols = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n",
+    "\n",
+    "leg = df[df[\"Legendary\"] == True]\n",
+    "non = df[df[\"Legendary\"] == False]\n",
+    "\n",
+    "results = []\n",
+    "for col in stats_cols:\n",
+    "    x = leg[col].dropna()\n",
+    "    y = non[col].dropna()\n",
+    "    t, p = st.ttest_ind(x, y, equal_var=False, nan_policy=\"omit\")\n",
+    "    results.append({\n",
+    "        \"stat\": col,\n",
+    "        \"mean_legendary\": float(np.mean(x)),\n",
+    "        \"mean_nonlegendary\": float(np.mean(y)),\n",
+    "        \"t_stat\": float(t),\n",
+    "        \"p_value\": float(p)\n",
+    "    })\n",
+    "\n",
+    "# Present as a small table and mark significance at alpha=0.05 (uncorrected)\n",
+    "res_df = pd.DataFrame(results)\n",
+    "res_df[\"significant_0.05\"] = res_df[\"p_value\"] < 0.05\n",
+    "print(res_df.to_string(index=False))\n",
+    "\n"
    ]
   },
   {
@@ -337,7 +412,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -453,7 +528,7 @@
        "4       624.0       262.0         1.9250             65500.0  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -483,10 +558,85 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "N (near) = 6829, N (far) = 10171\n",
+      "Mean price (near): $246,952\n",
+      "Mean price (far): $180,678\n",
+      "Welch t-statistic: 37.992\n",
+      "One-sided p-value (near > far): 1.503e-301\n",
+      "Effect size (Hedges' g): 0.595\n",
+      "Conclusion: Reject H0: Houses near a school or hospital are more expensive (at α=0.05).\n"
+     ]
+    }
+   ],
+   "source": [
+    "# H0: mean price (near) == mean price (far)\n",
+    "# H1: mean price (near) > mean price (far) (one-sided Welch t-test, α = 0.05)\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy.stats as st\n",
+    "\n",
+    "# Coordinates given in the prompt\n",
+    "school_lon, school_lat = -118, 34\n",
+    "hospital_lon, hospital_lat = -122, 37\n",
+    "\n",
+    "# Safety check for expected columns\n",
+    "required_cols = {\"longitude\", \"latitude\", \"median_house_value\"}\n",
+    "missing = required_cols - set(df.columns)\n",
+    "if missing:\n",
+    "    raise ValueError(f\"DataFrame is missing required columns: {missing}\")\n",
+    "\n",
+    "# Euclidean distance in degrees (as requested in the hint)\n",
+    "def euclid_distance(lon1, lat1, lon2, lat2):\n",
+    "    return np.sqrt((lon1 - lon2)**2 + (lat1 - lat2)**2)\n",
+    "\n",
+    "# Distances to school and hospital\n",
+    "dist_school = euclid_distance(df[\"longitude\"], df[\"latitude\"], school_lon, school_lat)\n",
+    "dist_hosp = euclid_distance(df[\"longitude\"], df[\"latitude\"], hospital_lon, hospital_lat)\n",
+    "\n",
+    "# Near if closer than 0.50 degrees to either\n",
+    "near_threshold = 0.50\n",
+    "near_mask = (dist_school < near_threshold) | (dist_hosp < near_threshold)\n",
+    "\n",
+    "# Split groups\n",
+    "price_col = \"median_house_value\"\n",
+    "near_prices = df.loc[near_mask, price_col].dropna()\n",
+    "far_prices = df.loc[~near_mask, price_col].dropna()\n",
+    "\n",
+    "# One-sided Welch t-test for H1 (near > far): alternative=\"greater\" (SciPy >= 1.6)\n",
+    "# returns the one-sided p-value directly. NaNs were already removed by dropna() above.\n",
+    "t_stat, p_one_sided = st.ttest_ind(near_prices, far_prices, equal_var=False, alternative=\"greater\")\n",
+    "\n",
+    "# Effect size: Hedges' g = Cohen's d on the pooled SD, times a small-sample bias correction\n",
+    "def hedges_g(x, y):\n",
+    "    nx, ny = len(x), len(y)\n",
+    "    sx2, sy2 = np.var(x, ddof=1), np.var(y, ddof=1)\n",
+    "    sp = np.sqrt(((nx - 1)*sx2 + (ny - 1)*sy2) / (nx + ny - 2))\n",
+    "    d = (np.mean(x) - np.mean(y)) / sp\n",
+    "    # small-sample correction\n",
+    "    J = 1 - (3 / (4*(nx + ny) - 9))\n",
+    "    return J * d\n",
+    "\n",
+    "g = hedges_g(near_prices.values, far_prices.values)\n",
+    "\n",
+    "alpha = 0.05\n",
+    "print(f\"N (near) = {len(near_prices)}, N (far) = {len(far_prices)}\")\n",
+    "print(f\"Mean price (near): ${near_prices.mean():,.0f}\")\n",
+    "print(f\"Mean price (far): ${far_prices.mean():,.0f}\")\n",
+    "print(f\"Welch t-statistic: {t_stat:.3f}\")\n",
+    "print(f\"One-sided p-value (near > far): {p_one_sided:.4g}\")\n",
+    "print(f\"Effect size (Hedges' g): {g:.3f}\")\n",
+    "print(\"Conclusion:\", \"Reject H0: Houses near a school or hospital are more expensive (at α=0.05).\"\n",
+    "      if p_one_sided < alpha else\n",
+    "      \"Fail to reject H0: Insufficient evidence that near houses are more expensive at α=0.05.\")"
+   ]
   },
   {
    "cell_type": "code",
@@ -498,7 +648,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -512,7 +662,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.12.2"
   }
  },
  "nbformat": 4,