data-bootcamp-v4 · Antonio-Gouveia · Dec 1, 2025
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -278,14 +278,14 @@
        "[800 rows x 11 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n",
-    "df"
+    "df_pokemon = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n",
+    "df_pokemon"
    ]
   },
   {
@@ -297,11 +297,58 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--- T-Test: Dragon HP vs. Grass HP (Unilateral) ---\n",
+      "Mean HP Dragon: 83.31\n",
+      "Mean HP Grass: 67.27\n",
+      "T-Statistic: 3.5904\n",
+      "P-Value (One-Tailed): 0.0003\n",
+      "\n",
+      "Finding: Since the p-value (0.0003) is less than alpha (0.05), we REJECT the Null Hypothesis (H₀).\n",
+      "Conclusion: There is statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "from scipy.stats import ttest_ind\n",
+    "\n",
+    "# Significance level for the test.\n",
+    "alpha = 0.05\n",
+    "\n",
+    "# 1. HP samples for the two 'Type 1' values being compared.\n",
+    "hp_dragon = df_pokemon.loc[df_pokemon['Type 1'] == 'Dragon', 'HP']\n",
+    "hp_grass = df_pokemon.loc[df_pokemon['Type 1'] == 'Grass', 'HP']\n",
+    "\n",
+    "# 2. One-tailed two-sample t-test.\n",
+    "#    H0: mu_Dragon <= mu_Grass    Ha: mu_Dragon > mu_Grass\n",
+    "#    alternative='greater' makes SciPy return the right-tailed p-value directly,\n",
+    "#    so no manual halving of a two-tailed p-value (and no sign check) is needed.\n",
+    "#    equal_var keeps its default (True), i.e. the pooled-variance Student t-test;\n",
+    "#    pass equal_var=False for Welch's test if the group variances differ noticeably.\n",
+    "t_stat, p_value_one_tailed = ttest_ind(\n",
+    "    hp_dragon, hp_grass, alternative='greater', nan_policy='omit'\n",
+    ")\n",
+    "\n",
+    "print(\"--- T-Test: Dragon HP vs. Grass HP (Unilateral) ---\")\n",
+    "print(f\"Mean HP Dragon: {hp_dragon.mean():.2f}\")\n",
+    "print(f\"Mean HP Grass: {hp_grass.mean():.2f}\")\n",
+    "print(f\"T-Statistic: {t_stat:.4f}\")\n",
+    "print(f\"P-Value (One-Tailed): {p_value_one_tailed:.4f}\")\n",
+    "\n",
+    "# 3. Decision: reject H0 when the one-tailed p-value is below alpha.\n",
+    "#    alpha is interpolated into the messages so the text cannot drift from the threshold.\n",
+    "if p_value_one_tailed < alpha:\n",
+    "    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.4f}) is less than alpha ({alpha}), we REJECT the Null Hypothesis (H₀).\")\n",
+    "    print(\"Conclusion: There is statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.\")\n",
+    "else:\n",
+    "    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.4f}) is greater than alpha ({alpha}), we FAIL TO REJECT the Null Hypothesis (H₀).\")\n",
+    "    print(\"Conclusion: There is no statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.\")"
    ]
   },
   {
@@ -313,11 +360,103 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--- T-Test: Legendary vs. Non-Legendary Stats (Bilateral) ---\n",
+      "\n",
+      "Statistic: HP\n",
+      "  P-Value: 0.000000\n",
+      "  Finding: Reject H₀\n",
+      "  Conclusion: Significant difference found\n",
+      "\n",
+      "Statistic: Attack\n",
+      "  P-Value: 0.000000\n",
+      "  Finding: Reject H₀\n",
+      "  Conclusion: Significant difference found\n",
+      "\n",
+      "Statistic: Defense\n",
+      "  P-Value: 0.000000\n",
+      "  Finding: Reject H₀\n",
+      "  Conclusion: Significant difference found\n",
+      "\n",
+      "Statistic: Sp. Atk\n",
+      "  P-Value: 0.000000\n",
+      "  Finding: Reject H₀\n",
+      "  Conclusion: Significant difference found\n",
+      "\n",
+      "Statistic: Sp. Def\n",
+      "  P-Value: 0.000000\n",
+      "  Finding: Reject H₀\n",
+      "  Conclusion: Significant difference found\n",
+      "\n",
+      "Statistic: Speed\n",
+      "  P-Value: 0.000000\n",
+      "  Finding: Reject H₀\n",
+      "  Conclusion: Significant difference found\n",
+      "\n",
+      "Detailed Results:\n",
+      "         Legendary Mean  Non-Legendary Mean     T-Stat       P-Value\n",
+      "HP            92.738462           67.182313   8.036124  3.330648e-15\n",
+      "Attack       116.676923           75.669388  10.397321  7.827253e-24\n",
+      "Defense       99.661538           71.559184   7.181240  1.584223e-12\n",
+      "Sp. Atk      122.184615           68.454422  14.191406  6.314916e-41\n",
+      "Sp. Def      105.938462           68.892517  11.037751  1.843981e-26\n",
+      "Speed        100.184615           65.455782   9.765234  2.354075e-21\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "# Significance level, set once because it is the same for every stat.\n",
+    "alpha = 0.05\n",
+    "\n",
+    "# 1. Stats to compare between the two groups.\n",
+    "stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
+    "\n",
+    "# 2. Split the rows into Legendary and Non-Legendary.\n",
+    "legendary_stats = df_pokemon[df_pokemon['Legendary'] == True]\n",
+    "non_legendary_stats = df_pokemon[df_pokemon['Legendary'] == False]\n",
+    "\n",
+    "results = {}\n",
+    "\n",
+    "print(\"--- T-Test: Legendary vs. Non-Legendary Stats (Bilateral) ---\")\n",
+    "\n",
+    "# 3. Two-tailed two-sample t-test per stat (H0: both groups have the same mean).\n",
+    "#    NOTE(review): no multiple-comparison correction is applied across the six tests.\n",
+    "for stat in stat_columns:\n",
+    "    legendary_group = legendary_stats[stat]\n",
+    "    non_legendary_group = non_legendary_stats[stat]\n",
+    "\n",
+    "    t_stat, p_value = ttest_ind(legendary_group, non_legendary_group, nan_policy='omit')\n",
+    "\n",
+    "    results[stat] = {\n",
+    "        'T-Stat': t_stat,\n",
+    "        'P-Value': p_value,\n",
+    "        'Legendary Mean': legendary_group.mean(),\n",
+    "        'Non-Legendary Mean': non_legendary_group.mean(),\n",
+    "    }\n",
+    "\n",
+    "    # Decision for this stat: reject H0 when the p-value is below alpha.\n",
+    "    if p_value < alpha:\n",
+    "        finding = \"Reject H₀\"\n",
+    "        conclusion = \"Significant difference found\"\n",
+    "    else:\n",
+    "        finding = \"Fail to Reject H₀\"\n",
+    "        conclusion = \"No significant difference found\"\n",
+    "\n",
+    "    print(f\"\\nStatistic: {stat}\")\n",
+    "    print(f\"  P-Value: {p_value:.6f}\")\n",
+    "    print(f\"  Finding: {finding}\")\n",
+    "    print(f\"  Conclusion: {conclusion}\")\n",
+    "\n",
+    "# Summary table: one row per stat, built from the values collected in the loop.\n",
+    "results_df = pd.DataFrame(results).T\n",
+    "print(\"\\nDetailed Results:\")\n",
+    "print(results_df[['Legendary Mean', 'Non-Legendary Mean', 'T-Stat', 'P-Value']])"
    ]
   },
   {
@@ -337,7 +476,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -453,14 +592,14 @@
        "4       624.0       262.0         1.9250             65500.0  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
-    "df.head()"
+    "df_housing = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
+    "df_housing.head()"
    ]
   },
   {
@@ -483,10 +622,102 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Proximity variable created.\n",
+      "is_close\n",
+      "False    10170\n",
+      "True      6830\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Reference points, stored as (longitude, latitude).\n",
+    "SCHOOL_COORD = (-118, 34)\n",
+    "HOSPITAL_COORD = (-122, 37)\n",
+    "DISTANCE_THRESHOLD = 0.50  # largest distance that still counts as close\n",
+    "\n",
+    "def euclidean_distance(lat1, lon1, lat2, lon2):\n",
+    "    \"\"\"Return the straight-line distance between (lat1, lon1) and (lat2, lon2).\"\"\"\n",
+    "    squared_gap = (lon2 - lon1)**2 + (lat2 - lat1)**2\n",
+    "    return np.sqrt(squared_gap)\n",
+    "\n",
+    "# Distance from every row to each reference point.\n",
+    "df_housing['dist_to_school'] = euclidean_distance(\n",
+    "    lat1=df_housing['latitude'], lon1=df_housing['longitude'], lat2=SCHOOL_COORD[1], lon2=SCHOOL_COORD[0]\n",
+    ")\n",
+    "df_housing['dist_to_hospital'] = euclidean_distance(\n",
+    "    lat1=df_housing['latitude'], lon1=df_housing['longitude'], lat2=HOSPITAL_COORD[1], lon2=HOSPITAL_COORD[0]\n",
+    ")\n",
+    "\n",
+    "# A row is flagged close when either distance is within DISTANCE_THRESHOLD.\n",
+    "df_housing['is_close'] = (\n",
+    "    df_housing['dist_to_school'].le(DISTANCE_THRESHOLD)\n",
+    "    | df_housing['dist_to_hospital'].le(DISTANCE_THRESHOLD)\n",
+    ")\n",
+    "\n",
+    "print(\"\\nProximity variable created.\")\n",
+    "print(df_housing['is_close'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "--- T-Test: Close Houses vs. Far Houses (Unilateral) ---\n",
+      "Mean House Value (Close): $246,934.64\n",
+      "Mean House Value (Far): $180,683.57\n",
+      "T-Statistic: 38.0333\n",
+      "P-Value (One-Tailed): 0.000000\n",
+      "\n",
+      "Finding: Since the p-value (0.000000) is less than alpha (0.05), we REJECT the Null Hypothesis (H₀).\n",
+      "Conclusion: There is statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Significance level for the test.\n",
+    "alpha = 0.05\n",
+    "\n",
+    "# 1. House values split by the boolean is_close flag.\n",
+    "close_houses = df_housing.loc[df_housing['is_close'], 'median_house_value']\n",
+    "far_houses = df_housing.loc[~df_housing['is_close'], 'median_house_value']\n",
+    "\n",
+    "# 2. One-tailed two-sample t-test.\n",
+    "#    H0: mu_Close <= mu_Far    Ha: mu_Close > mu_Far\n",
+    "#    alternative='greater' yields the right-tailed p-value directly (no manual halving).\n",
+    "#    ttest_ind must already be imported from scipy.stats by an earlier cell.\n",
+    "t_stat, p_value_one_tailed = ttest_ind(\n",
+    "    close_houses, far_houses, alternative='greater', nan_policy='omit'\n",
+    ")\n",
+    "\n",
+    "print(\"\\n--- T-Test: Close Houses vs. Far Houses (Unilateral) ---\")\n",
+    "print(f\"Mean House Value (Close): ${close_houses.mean():,.2f}\")\n",
+    "print(f\"Mean House Value (Far): ${far_houses.mean():,.2f}\")\n",
+    "print(f\"T-Statistic: {t_stat:.4f}\")\n",
+    "print(f\"P-Value (One-Tailed): {p_value_one_tailed:.6f}\")\n",
+    "\n",
+    "# 3. Decision: reject H0 when the one-tailed p-value is below alpha.\n",
+    "if p_value_one_tailed < alpha:\n",
+    "    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.6f}) is less than alpha ({alpha}), we REJECT the Null Hypothesis (H₀).\")\n",
+    "    print(\"Conclusion: There is statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.\")\n",
+    "else:\n",
+    "    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.6f}) is greater than alpha ({alpha}), we FAIL TO REJECT the Null Hypothesis (H₀).\")\n",
+    "    print(\"Conclusion: There is no statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.\")"
+   ]
   },
   {
    "cell_type": "code",
@@ -498,9 +729,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [conda env:base] *",
    "language": "python",
-   "name": "python3"
+   "name": "conda-base-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -512,9 +743,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }