labd #363
192 changes: 176 additions & 16 deletions lab-hypothesis-testing.ipynb
@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -278,7 +278,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -297,11 +297,47 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean(Dragon HP): 83.31 | Mean(Other HP): 68.67\n",
"Welch t-statistic: 3.400\n",
"One-sided p-value (Dragon > Others): 0.0008676\n",
"Conclusion: Reject H0: Dragons have higher average HP.\n"
]
}
],
"source": [
"#code here"
"# Hypothesis: Dragons have higher average HP than non-Dragons.\n",
"# H0: mu_Dragon = mu_Others\n",
"# H1: mu_Dragon > mu_Others (one-sided)\n",
"\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"\n",
"dragons = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"].dropna()\n",
"others = df[df[\"Type 1\"] != \"Dragon\"][\"HP\"].dropna()\n",
"\n",
"t_stat, p_two_sided = st.ttest_ind(dragons, others, equal_var=False, nan_policy=\"omit\")\n",
"\n",
"# Convert two-sided p to one-sided for 'greater' alternative:\n",
"# If mean(dragon) > mean(others), p_one_sided = p_two_sided / 2; else 1 - p_two_sided / 2\n",
"mean_diff = dragons.mean() - others.mean()\n",
"if mean_diff > 0:\n",
" p_one_sided = p_two_sided / 2\n",
"else:\n",
" p_one_sided = 1 - (p_two_sided / 2)\n",
"\n",
"alpha = 0.05\n",
"print(f\"Mean(Dragon HP): {dragons.mean():.2f} | Mean(Other HP): {others.mean():.2f}\")\n",
"print(f\"Welch t-statistic: {t_stat:.3f}\")\n",
"print(f\"One-sided p-value (Dragon > Others): {p_one_sided:.4g}\")\n",
"print(\"Conclusion:\", \"Reject H0: Dragons have higher average HP.\"\n",
" if p_one_sided < alpha else \"Fail to reject H0: Insufficient evidence that Dragons have higher HP.\")"
]
},
{
@@ -313,11 +349,56 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" stat mean_legendary mean_nonlegendary t_stat p_value significant_0.05\n",
" HP 92.738462 67.182313 8.981370 1.002691e-13 True\n",
" Attack 116.676923 75.669388 10.438134 2.520372e-16 True\n",
"Defense 99.661538 71.559184 7.637078 4.826998e-11 True\n",
"Sp. Atk 122.184615 68.454422 13.417450 1.551461e-21 True\n",
"Sp. Def 105.938462 68.892517 10.015697 2.294933e-15 True\n",
" Speed 100.184615 65.455782 11.475044 1.049016e-18 True\n"
]
}
],
"source": [
"#code here"
"# Hypothesis: Legendary Pokémon have different mean stats than non-Legendary.\n",
"# For each stat, test: H0: mu_Leg = mu_NonLeg vs H1: mu_Leg != mu_NonLeg (two-sided)\n",
"\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"import numpy as np\n",
"\n",
"\n",
"stats_cols = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n",
"\n",
"leg = df[df[\"Legendary\"] == True]\n",
"non = df[df[\"Legendary\"] == False]\n",
"\n",
"results = []\n",
"for col in stats_cols:\n",
" x = leg[col].dropna()\n",
" y = non[col].dropna()\n",
" t, p = st.ttest_ind(x, y, equal_var=False, nan_policy=\"omit\")\n",
" results.append({\n",
" \"stat\": col,\n",
" \"mean_legendary\": float(np.mean(x)),\n",
" \"mean_nonlegendary\": float(np.mean(y)),\n",
" \"t_stat\": float(t),\n",
" \"p_value\": float(p)\n",
" })\n",
"\n",
"# Present as a small table and mark significance at alpha=0.05 (uncorrected)\n",
"import pandas as pd\n",
"res_df = pd.DataFrame(results)\n",
"res_df[\"significant_0.05\"] = res_df[\"p_value\"] < 0.05\n",
"print(res_df.to_string(index=False))\n",
"\n"
]
},
{
@@ -337,7 +418,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -453,7 +534,7 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -483,10 +564,89 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"N (near) = 6829, N (far) = 10171\n",
"Mean price (near): $246,952\n",
"Mean price (far): $180,678\n",
"Welch t-statistic: 37.992\n",
"One-sided p-value (near > far): 1.503e-301\n",
"Effect size (Hedges' g): 0.595\n",
"Conclusion: Reject H0: Houses near a school or hospital are more expensive (at α=0.05).\n"
]
}
],
"source": [
"\n",
"# H0: mean price (near) == mean price (far)\n",
"# H1: mean price (near) > mean price (far) (one-sided Welch t-test, α = 0.05)\n",
"\n",
"\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"\n",
"# Coordinates given in the prompt\n",
"school_lon, school_lat = -118, 34\n",
"hospital_lon, hospital_lat = -122, 37\n",
"\n",
"# Safety check for expected columns\n",
"required_cols = {\"longitude\", \"latitude\", \"median_house_value\"}\n",
"missing = required_cols - set(df.columns)\n",
"if missing:\n",
" raise ValueError(f\"DataFrame is missing required columns: {missing}\")\n",
"\n",
"# Euclidean distance in degrees (as requested in the hint)\n",
"def euclid_distance(lon1, lat1, lon2, lat2):\n",
" return np.sqrt((lon1 - lon2)**2 + (lat1 - lat2)**2)\n",
"\n",
"# Distances to school and hospital\n",
"dist_school = euclid_distance(df[\"longitude\"], df[\"latitude\"], school_lon, school_lat)\n",
"dist_hosp = euclid_distance(df[\"longitude\"], df[\"latitude\"], hospital_lon, hospital_lat)\n",
"\n",
"# Near if closer than 0.50 degrees to either\n",
"near_threshold = 0.50\n",
"near_mask = (dist_school < near_threshold) | (dist_hosp < near_threshold)\n",
"\n",
"# Split groups\n",
"price_col = \"median_house_value\"\n",
"near_prices = df.loc[near_mask, price_col].dropna()\n",
"far_prices = df.loc[~near_mask, price_col].dropna()\n",
"\n",
"# Welch two-sided t-test, then convert to one-sided for alternative \"near > far\"\n",
"t_stat, p_two_sided = st.ttest_ind(near_prices, far_prices, equal_var=False, nan_policy=\"omit\")\n",
"mean_diff = near_prices.mean() - far_prices.mean()\n",
"p_one_sided = p_two_sided / 2 if mean_diff > 0 else 1 - (p_two_sided / 2)\n",
"\n",
"# Effect size (Hedges' g ~ unbiased Cohen's d with Welch S_p)\n",
"def hedges_g(x, y):\n",
" nx, ny = len(x), len(y)\n",
" sx2, sy2 = np.var(x, ddof=1), np.var(y, ddof=1)\n",
" sp = np.sqrt(((nx - 1)*sx2 + (ny - 1)*sy2) / (nx + ny - 2))\n",
" d = (np.mean(x) - np.mean(y)) / sp\n",
" # small-sample correction\n",
" J = 1 - (3 / (4*(nx + ny) - 9))\n",
" return J * d\n",
"\n",
"g = hedges_g(near_prices.values, far_prices.values)\n",
"\n",
"alpha = 0.05\n",
"print(f\"N (near) = {len(near_prices)}, N (far) = {len(far_prices)}\")\n",
"print(f\"Mean price (near): ${near_prices.mean():,.0f}\")\n",
"print(f\"Mean price (far): ${far_prices.mean():,.0f}\")\n",
"print(f\"Welch t-statistic: {t_stat:.3f}\")\n",
"print(f\"One-sided p-value (near > far): {p_one_sided:.4g}\")\n",
"print(f\"Effect size (Hedges' g): {g:.3f}\")\n",
"print(\"Conclusion:\", \"Reject H0: Houses near a school or hospital are more expensive (at α=0.05).\"\n",
" if p_one_sided < alpha else\n",
" \"Fail to reject H0: Insufficient evidence that near houses are more expensive at α=0.05.\")"
]
},
{
"cell_type": "code",
@@ -498,7 +658,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
@@ -512,7 +672,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.2"
}
},
"nbformat": 4,