data-bootcamp-v4 · levgirgin · Aug 29, 2025
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -278,7 +278,7 @@
        "[800 rows x 11 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -297,11 +297,65 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#code here"
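+    "# H0: Dragon-type Pokémon do not have a higher mean HP than Grass-type Pokémon (mu_dragon <= mu_grass)\n",
+    "# H1: Dragon-type Pokémon have a higher mean HP than Grass-type Pokémon (mu_dragon > mu_grass)\n"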
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reject H0: Dragon-type Pokémon have significantly higher HP than Grass-type Pokémon.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from scipy.stats import ttest_ind\n",
+    "\n",
+    "# Split HP by primary type for the two groups being compared.\n",
+    "grass_hp = df.loc[df['Type 1'] == 'Grass', 'HP']\n",
+    "dragon_hp = df.loc[df['Type 1'] == 'Dragon', 'HP']\n",
+    "\n",
+    "# Welch's t-test: equal_var=False avoids assuming equal variances in the two groups.\n",
+    "# alternative='greater' tests H1 (mean dragon_hp > mean grass_hp) directly, so the\n",
+    "# p-value returned is already one-tailed; no manual conversion from a two-tailed\n",
+    "# value is needed.\n",
+    "t_stat, p_value_one_tailed = ttest_ind(dragon_hp, grass_hp, equal_var=False, alternative='greater')\n",
+    "\n",
+    "# Decide at the 5% significance level.\n",
+    "if p_value_one_tailed < 0.05:\n",
+    "    print(\"Reject H0: Dragon-type Pokémon have significantly higher HP than Grass-type Pokémon.\")\n",
+    "else:\n",
+    "    print(\"Fail to reject H0: No significant evidence that Dragon-type Pokémon have higher HP.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(3.3349632905124063, 0.0007993609745420598)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t_stat, p_value_one_tailed"
    ]
   },
   {
@@ -313,11 +367,57 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#code here"
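+    "# H0: Legendary and non-Legendary Pokémon have the same mean vector of HP, Attack, Defense, Sp. Atk, Sp. Def and Speed\n",
+    "# H1: The mean vectors of these six stats differ between Legendary and non-Legendary Pokémon"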
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Multivariate linear model\n",
+      "================================================================\n",
+      "                                                                \n",
+      "----------------------------------------------------------------\n",
+      "       Intercept         Value  Num DF  Den DF   F Value  Pr > F\n",
+      "----------------------------------------------------------------\n",
+      "          Wilks' lambda  0.0592 6.0000 793.0000 2100.8338 0.0000\n",
+      "         Pillai's trace  0.9408 6.0000 793.0000 2100.8338 0.0000\n",
+      " Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
+      "    Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
+      "----------------------------------------------------------------\n",
+      "                                                                \n",
+      "----------------------------------------------------------------\n",
+      "          Legendary        Value  Num DF  Den DF  F Value Pr > F\n",
+      "----------------------------------------------------------------\n",
+      "             Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n",
+      "            Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n",
+      "    Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n",
+      "       Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n",
+      "================================================================\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from statsmodels.multivariate.manova import MANOVA\n",
+    "\n",
+    "stats_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
+    "df_manova = df.loc[:, [*stats_cols, 'Legendary']]\n",
+    "# Q(...) quotes the column names that are not valid Python identifiers (space / dot).\n",
+    "manova_formula = 'HP + Attack + Defense + Q(\"Sp. Atk\") + Q(\"Sp. Def\") + Speed ~ Legendary'\n",
+    "maov = MANOVA.from_formula(manova_formula, data=df_manova)\n",
+    "result = maov.mv_test()\n",
+    "print(result)"
    ]
   },
   {
@@ -337,7 +437,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -453,7 +553,7 @@
        "4       624.0       262.0         1.9250             65500.0  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -483,22 +583,62 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "from scipy.stats import ttest_ind\n",
+    "\n",
+    "school_coords, hospital_coords = (-118, 34), (-122, 37)  # each is (longitude, latitude)\n",
+    "\n",
+    "def euclidean_distance(lat1, lon1, lat2, lon2):\n",
+    "    \"\"\"Return the straight-line (planar) distance between (lat1, lon1) and (lat2, lon2).\"\"\"\n",
+    "    return np.sqrt(np.square(lat1 - lat2) + np.square(lon1 - lon2))"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Close to facility house values: [124700. 137500. 169100. ... 277700. 319400. 286100.]\n",
+      "Far from facility house values: [ 66900.  80100.  85700. ... 103600.  85800.  94600.]\n",
+      "t-statistic: 37.992,  p-value: 0.000\n",
+      "Reject H0: Houses close to a school or hospital are significantly more expensive.\n"
+     ]
+    }
+   ],
+   "source": [
+    "df['dist_school'] = euclidean_distance(df['latitude'], df['longitude'], school_coords[1], school_coords[0])\n",
+    "df['dist_hospital'] = euclidean_distance(df['latitude'], df['longitude'], hospital_coords[1], hospital_coords[0])\n",
+    "\n",
+    "# A row counts as 'close' when either distance is under 0.50 (same units as latitude/longitude).\n",
+    "df['close_to_facility'] = (df['dist_school'] < 0.50) | (df['dist_hospital'] < 0.50)\n",
+    "\n",
+    "close_prices = df.loc[df['close_to_facility'], 'median_house_value']\n",
+    "far_prices = df.loc[~df['close_to_facility'], 'median_house_value']\n",
+    "\n",
+    "# Welch's t-test with alternative='greater': one-tailed test of mean(close_prices) > mean(far_prices).\n",
+    "t_stat, p_value_one_tailed = ttest_ind(close_prices, far_prices, equal_var=False, alternative='greater')\n",
+    "\n",
+    "print(\"Close to facility house values:\", close_prices.values)\n",
+    "print(\"Far from facility house values:\", far_prices.values)\n",
+    "print(f\"t-statistic: {t_stat:.3f},  p-value: {p_value_one_tailed:.3f}\")\n",
+    "\n",
+    "if p_value_one_tailed < 0.05:\n",
+    "    print(\"Reject H0: Houses close to a school or hospital are significantly more expensive.\")\n",
+    "else:\n",
+    "    print(\"Fail to reject H0: No significant evidence that proximity to a school or hospital increases house prices.\")"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -512,7 +652,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.11.7"
   }
  },
  "nbformat": 4,