data-bootcamp-v4 · lucielopez · Sep 1, 2025
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -38,15 +38,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "#libraries\n",
     "import pandas as pd\n",
     "import scipy.stats as st\n",
-    "import numpy as np\n",
-    "\n"
+    "import numpy as np"
    ]
   },
   {
@@ -297,11 +296,73 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#code here"
+    "# H0 : The average HP of Dragon-type Pokémon equals the average HP of Grass-type Pokémon\n",
+    "# H1 : The average HP of Dragon-type Pokémon is greater than the average HP of Grass-type Pokémon\n",
+    "\n",
+    "# one-sided two-sample t-test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "69.25875"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"HP\"].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n",
+    "grass_hp  = df[df['Type 1'] == 'Grass']['HP']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "t-statistic: 3.3349632905124063\n",
+      "p-value (bilateral): 0.0015987219490841195\n",
+      "✅ H0 rejected: Dragons have significantly more HP than Grass (5%).\n"
+     ]
+    }
+   ],
+   "source": [
+    "t_stat, p_val = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n",
+    "\n",
+    "print(\"t-statistic:\", t_stat)\n",
+    "print(\"p-value (bilateral):\", p_val)\n",
+    "\n",
+    "# One-sided hypothesis (Dragon > Grass):\n",
+    "# we halve the two-sided p_val and require the Dragon sample mean to be above the Grass sample mean\n",
+    "if (dragon_hp.mean() > grass_hp.mean()) and (p_val/2 < 0.05):\n",
+    "    print(\"✅ H0 rejected: Dragons have significantly more HP than Grass (5%).\")\n",
+    "else:\n",
+    "    print(\"❌ H0 cannot be rejected at the 5% threshold.\")"
    ]
   },
   {
@@ -313,11 +374,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#code here"
+    "# H0 : for each stat, Legendary and non-Legendary Pokémon have the same mean\n",
+    "# H1 : for each stat, the Legendary and non-Legendary means differ"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "HP      | mean Legendary=92.7, mean NonLegendary=67.2, F=64.58, p=0.0000\n",
+      "Attack  | mean Legendary=116.7, mean NonLegendary=75.7, F=108.10, p=0.0000\n",
+      "Defense | mean Legendary=99.7, mean NonLegendary=71.6, F=51.57, p=0.0000\n",
+      "Sp. Atk | mean Legendary=122.2, mean NonLegendary=68.5, F=201.40, p=0.0000\n",
+      "Sp. Def | mean Legendary=105.9, mean NonLegendary=68.9, F=121.83, p=0.0000\n",
+      "Speed   | mean Legendary=100.2, mean NonLegendary=65.5, F=95.36, p=0.0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "legendary = df[df[\"Legendary\"] == True]\n",
+    "nonlegend = df[df[\"Legendary\"] == False]\n",
+    "\n",
+    "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n",
+    "\n",
+    "for s in stats:\n",
+    "    f, p = st.f_oneway(legendary[s], nonlegend[s])\n",
+    "    print(f\"{s:7s} | mean Legendary={legendary[s].mean():.1f}, mean NonLegendary={nonlegend[s].mean():.1f}, F={f:.2f}, p={p:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We reject H0 for every stat tested (all p-values are below 0.05)\n",
+    "\n",
+    "# This means that Legendary Pokémon have significantly different (and on average higher) stats than non-Legendaries in all dimensions tested\n",
+    "# (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed)."
    ]
   },
   {
@@ -337,7 +440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -453,7 +556,7 @@
        "4       624.0       262.0         1.9250             65500.0  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -486,19 +589,65 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# H0 : The average price of houses close to the school or the hospital (distance ≤ 0.5) is no different from that of houses farther away (> 0.5).\n",
+    "# H1 : The average price of the close houses is higher than that of the far houses."
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Close mean: 246934.639238653\n",
+      "Far mean: 180683.57168141592\n",
+      "t-stat: 37.97959304116918\n",
+      "p-value (bilateral): 4.642199519694926e-301\n",
+      "✅ H0 rejected : nearby houses are significantly more expensive (5%).\n"
+     ]
+    }
+   ],
+   "source": [
+    "school = (-118, 34)      # (longitude, latitude)\n",
+    "hospital = (-122, 37)    # (longitude, latitude)\n",
+    "\n",
+    "def euclidean_distance(x1, y1, x2, y2):\n",
+    "    \"\"\"Straight-line distance between (x1, y1) and (x2, y2); accepts scalars or whole columns.\"\"\"\n",
+    "    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)\n",
+    "\n",
+    "# Distances, computed on whole columns at once instead of row by row with df.apply\n",
+    "df[\"dist_school\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], *school)\n",
+    "df[\"dist_hospital\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], *hospital)\n",
+    "\n",
+    "# Indicator variable \"close\": within 0.5 of the school or of the hospital\n",
+    "df[\"close\"] = ((df[\"dist_school\"] <= 0.5) | (df[\"dist_hospital\"] <= 0.5))\n",
+    "\n",
+    "# Separate groups (the boolean column is used directly as the row mask)\n",
+    "close_prices = df.loc[df[\"close\"], \"median_house_value\"]\n",
+    "far_prices   = df.loc[~df[\"close\"], \"median_house_value\"]\n",
+    "\n",
+    "# Two-sample t-test without assuming equal variances (equal_var=False)\n",
+    "t_stat, p_val = st.ttest_ind(close_prices, far_prices, equal_var=False)\n",
+    "\n",
+    "print(\"Close mean:\", close_prices.mean())\n",
+    "print(\"Far mean:\", far_prices.mean())\n",
+    "print(\"t-stat:\", t_stat)\n",
+    "print(\"p-value (bilateral):\", p_val)\n",
+    "# One-sided test (close > far): halve the two-sided p-value and check the direction of the difference\n",
+    "if (close_prices.mean() > far_prices.mean()) and (p_val/2 < 0.05):\n",
+    "    print(\"✅ H0 rejected : nearby houses are significantly more expensive (5%).\")\n",
+    "else:\n",
+    "    print(\"❌ H0 cannot be rejected at the 5% threshold.\")\n"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -512,7 +661,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.12.2"
   }
  },
  "nbformat": 4,