data-bootcamp-v4 · Yilak-maker · Sep 28, 2025
diff --git a/hypothesis_testing.ipynb b/hypothesis_testing.ipynb
@@ -0,0 +1,307 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f06c4da6-84ca-4138-9d53-f402ab4b9fd8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Libraries\n",
+    "import pandas as pd\n",
+    "import scipy.stats as st"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "019be2df-a44f-42d3-b940-ceb03bb23214",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pokémon dataset, fetched from the data-bootcamp-v4 GitHub repository\n",
+    "POKEMON_CSV_URL = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\"\n",
+    "df = pd.read_csv(POKEMON_CSV_URL)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e7ee790-e9b9-4a17-971f-262ff1e5b292",
+   "metadata": {},
+   "source": [
+    "## Dragon vs Grass HP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a044c64f-8cc3-4990-a4b7-dc65dbf8c805",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HP values of the Pokémon whose \"Type 1\" is Dragon / Grass\n",
+    "type_1 = df[\"Type 1\"]\n",
+    "dragon_hp = df.loc[type_1 == \"Dragon\", \"HP\"]\n",
+    "grass_hp = df.loc[type_1 == \"Grass\", \"HP\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "68fc8ae7-86d9-41ad-9e04-6e81f36bf67e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "T-statistic: 3.3349632905124063\n",
+      "One-sided P-value: 0.0007993609745420598\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One-sided Welch two-sample t-test (equal_var=False).\n",
+    "#   H0: mean Dragon HP <= mean Grass HP\n",
+    "#   H1: mean Dragon HP >  mean Grass HP\n",
+    "# alternative=\"greater\" makes SciPy return the one-sided p-value directly, so no\n",
+    "# manual halving of a two-sided p-value is needed and the result stays correct\n",
+    "# when the t-statistic is negative (p = 1 - two_sided / 2, not a hard-coded 1).\n",
+    "t_stat, p_val_one_sided = st.ttest_ind(\n",
+    "    dragon_hp, grass_hp, equal_var=False, alternative=\"greater\"\n",
+    ")\n",
+    "\n",
+    "print(\"T-statistic:\", t_stat)\n",
+    "print(\"One-sided P-value:\", p_val_one_sided)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1e3ea5d2-f8ec-431d-b7a3-b64e8d5aba15",
+   "metadata": {},
+   "source": [
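+    "**H0:** the mean HP of Dragon Pokémon is less than or equal to the mean HP of Grass Pokémon. **H1:** the mean HP of Dragon Pokémon is greater.\n",
+    "\n",
+    "Since the one-sided p-value (≈ 0.0008) is below 0.05, we reject H0.\n",
+    "\n",
+    "There is strong statistical evidence that Dragon Pokémon have a higher mean HP than Grass Pokémon at the 5% significance level."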
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "349dc507-26e9-4225-bee4-f70a5bec43d5",
+   "metadata": {},
+   "source": [
+    "## Legendary vs Non-Legendary Stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5dc22876-34f2-4246-94d6-dce383f05679",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "HP -> T-stat: 8.98, P-val: 0.0000\n",
+      "Attack -> T-stat: 10.44, P-val: 0.0000\n",
+      "Defense -> T-stat: 7.64, P-val: 0.0000\n",
+      "Sp. Atk -> T-stat: 13.42, P-val: 0.0000\n",
+      "Sp. Def -> T-stat: 10.02, P-val: 0.0000\n",
+      "Speed -> T-stat: 11.48, P-val: 0.0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n",
+    "\n",
+    "# Split the rows on the \"Legendary\" column\n",
+    "is_legendary = df[\"Legendary\"].eq(True)\n",
+    "is_not_legendary = df[\"Legendary\"].eq(False)\n",
+    "legendary = df.loc[is_legendary]\n",
+    "non_legendary = df.loc[is_not_legendary]\n",
+    "\n",
+    "# Two-sided Welch t-test (equal_var=False), run once per stat\n",
+    "for stat in stats:\n",
+    "    t_stat, p_val = st.ttest_ind(\n",
+    "        legendary[stat], non_legendary[stat], equal_var=False\n",
+    "    )\n",
+    "    print(f\"{stat} -> T-stat: {t_stat:.2f}, P-val: {p_val:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3fb884d6-740d-437d-9369-ab11b2b84fbb",
+   "metadata": {},
+   "source": [
+    "H0 (null hypothesis): Legendary and Non-Legendary Pokémon have the same mean for the stat.\n",
+    "\n",
+    "H1 (alternative hypothesis): Legendary and Non-Legendary Pokémon have different means for the stat.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "For all six stats the p-value rounds to 0.0000 at four decimal places, which is far below 0.05.\n",
+    "\n",
+    "Rule:\n",
+    "\n",
+    "p < 0.05 → Reject H0\n",
+    "\n",
+    "p ≥ 0.05 → Fail to reject H0\n",
+    "\n",
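+    "All six p-values are far below 0.05 → reject H0 for every stat: Legendary and Non-Legendary Pokémon differ significantly in mean HP, Attack, Defense, Sp. Atk, Sp. Def and Speed (the positive t-statistics show the Legendary means are the higher ones)."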
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0df1d471-57dc-48f1-b109-4b6cfa33c18f",
+   "metadata": {},
+   "source": [
+    "## Challenge 2: California Housing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "cc91f005-335e-4b7f-a758-4165af32e4fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',\n",
+      "       'total_bedrooms', 'population', 'households', 'median_income',\n",
+      "       'median_house_value'],\n",
+      "      dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import scipy.stats as st\n",
+    "\n",
+    "# California housing dataset, fetched from the data-bootcamp-v4 GitHub repository\n",
+    "HOUSING_CSV_URL = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\"\n",
+    "df2 = pd.read_csv(HOUSING_CSV_URL)\n",
+    "\n",
+    "# List the available columns\n",
+    "print(df2.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "cc501fd8-e918-49cb-a82e-b361d9cc7b04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reference points as (longitude, latitude)\n",
+    "school = (-118, 34)\n",
+    "hospital = (-122, 37)\n",
+    "\n",
+    "\n",
+    "def distance(x1, y1, x2, y2):\n",
+    "    \"\"\"Return the Euclidean distance between the points (x1, y1) and (x2, y2).\"\"\"\n",
+    "    dx = x1 - x2\n",
+    "    dy = y1 - y2\n",
+    "    return np.sqrt(dx**2 + dy**2)\n",
+    "\n",
+    "\n",
+    "# Distance from every row to each reference point, in raw coordinate units\n",
+    "school_lon, school_lat = school\n",
+    "hospital_lon, hospital_lat = hospital\n",
+    "df2[\"dist_school\"] = distance(df2[\"longitude\"], df2[\"latitude\"], school_lon, school_lat)\n",
+    "df2[\"dist_hospital\"] = distance(df2[\"longitude\"], df2[\"latitude\"], hospital_lon, hospital_lat)\n",
+    "\n",
+    "# Flag rows closer than 0.5 to the school or the hospital (1 = close, 0 = far)\n",
+    "near_school = df2[\"dist_school\"] < 0.5\n",
+    "near_hospital = df2[\"dist_hospital\"] < 0.5\n",
+    "df2[\"close\"] = np.where(near_school | near_hospital, 1, 0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3eed33a7-5a4c-4809-94b3-70557921e8cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Median house values for the rows flagged close (1) versus far (0)\n",
+    "house_value = df2[\"median_house_value\"]\n",
+    "close_houses = house_value[df2[\"close\"] == 1]\n",
+    "far_houses = house_value[df2[\"close\"] == 0]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "c791094e-c324-4dd2-862c-eeda8cf1e07a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "T-statistic: 37.992330214201516\n",
+      "One-sided P-value: 1.5032478884296307e-301\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One-sided Welch two-sample t-test (equal_var=False).\n",
+    "#   H0: mean value of close houses <= mean value of far houses\n",
+    "#   H1: mean value of close houses >  mean value of far houses\n",
+    "# alternative=\"greater\" makes SciPy return the one-sided p-value directly, so no\n",
+    "# manual halving of a two-sided p-value is needed and the result stays correct\n",
+    "# when the t-statistic is negative (p = 1 - two_sided / 2, not a hard-coded 1).\n",
+    "t_stat, p_val_one_sided = st.ttest_ind(\n",
+    "    close_houses, far_houses, equal_var=False, alternative=\"greater\"\n",
+    ")\n",
+    "\n",
+    "print(\"T-statistic:\", t_stat)\n",
+    "print(\"One-sided P-value:\", p_val_one_sided)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "fd8ee5ad-3ccc-442a-b843-9aa5ed041b14",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reject H0: Houses close to a school or hospital are significantly more expensive.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Decision at the 5% significance level, based on the one-sided p-value\n",
+    "verdict = (\n",
+    "    \"Reject H0: Houses close to a school or hospital are significantly more expensive.\"\n",
+    "    if p_val_one_sided < 0.05\n",
+    "    else \"Fail to reject H0: No significant evidence that proximity increases house value.\"\n",
+    ")\n",
+    "print(verdict)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7d56e96-0e3b-4add-9c08-e1d5dc965fcc",
+   "metadata": {},
+   "source": [
+    "I reject the null hypothesis at the 5% significance level. There is strong statistical evidence that houses located near a school or hospital (within a Euclidean distance of 0.5 in longitude/latitude units) are significantly more expensive than those farther away."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2a685fc-11ce-4ffa-b28e-ae33e1e4a4b7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}