From 01fea48298eaf7b2fbcf589f663a6365d0ad2dcd Mon Sep 17 00:00:00 2001 From: Yilak Kebede Date: Sun, 28 Sep 2025 10:38:04 +0200 Subject: [PATCH] Completed Hypothesis Testing Lab --- hypothesis_testing.ipynb | 307 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 hypothesis_testing.ipynb diff --git a/hypothesis_testing.ipynb b/hypothesis_testing.ipynb new file mode 100644 index 0000000..efbf97f --- /dev/null +++ b/hypothesis_testing.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f06c4da6-84ca-4138-9d53-f402ab4b9fd8", + "metadata": {}, + "outputs": [], + "source": [ + "# Libraries\n", + "import pandas as pd\n", + "import scipy.stats as st" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "019be2df-a44f-42d3-b940-ceb03bb23214", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "0e7ee790-e9b9-4a17-971f-262ff1e5b292", + "metadata": {}, + "source": [ + "## Dragon vs Grass HP" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a044c64f-8cc3-4990-a4b7-dc65dbf8c805", + "metadata": {}, + "outputs": [], + "source": [ + "# Dragon vs Grass HP\n", + "dragon_hp = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"]\n", + "grass_hp = df[df[\"Type 1\"] == \"Grass\"][\"HP\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "68fc8ae7-86d9-41ad-9e04-6e81f36bf67e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 3.3349632905124063\n", + "One-sided P-value: 0.0007993609745420598\n" + ] + } + ], + "source": [ + "# Welch two-sample t-test (two-sided by default)\n", + "t_stat, p_val_two_sided = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "# Convert to a one-sided p-value because the hypothesis is 
directional (H1: mean Dragon HP > mean Grass HP)\n", + "if t_stat > 0:\n", + "    p_val_one_sided = p_val_two_sided / 2\n", + "else:\n", + "    p_val_one_sided = 1 - p_val_two_sided / 2  # statistic points away from H1\n", + "\n", + "print(\"T-statistic:\", t_stat)\n", + "print(\"One-sided P-value:\", p_val_one_sided)" + ] + }, + { + "cell_type": "markdown", + "id": "1e3ea5d2-f8ec-431d-b7a3-b64e8d5aba15", + "metadata": {}, + "source": [ + "H0: Dragon Pokémon do not have a higher mean HP than Grass Pokémon; H1: they do. Since the one-sided p-value 0.0008 < 0.05:\n", + "\n", + "We reject H0.\n", + "\n", + "There is strong statistical evidence that Dragon Pokémon have a higher mean HP than Grass Pokémon, at the 5% significance level." + ] + }, + { + "cell_type": "markdown", + "id": "349dc507-26e9-4225-bee4-f70a5bec43d5", + "metadata": {}, + "source": [ + "## Legendary vs Non-Legendary Stats" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5dc22876-34f2-4246-94d6-dce383f05679", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP -> T-stat: 8.98, P-val: 0.0000\n", + "Attack -> T-stat: 10.44, P-val: 0.0000\n", + "Defense -> T-stat: 7.64, P-val: 0.0000\n", + "Sp. Atk -> T-stat: 13.42, P-val: 0.0000\n", + "Sp. Def -> T-stat: 10.02, P-val: 0.0000\n", + "Speed -> T-stat: 11.48, P-val: 0.0000\n" + ] + } + ], + "source": [ + "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. 
Def\", \"Speed\"]\n", + "\n", + "legendary = df[df[\"Legendary\"] == True]\n", + "non_legendary = df[df[\"Legendary\"] == False]\n", + "\n", + "for stat in stats:\n", + "    t_stat, p_val = st.ttest_ind(legendary[stat], non_legendary[stat], equal_var=False)\n", + "    print(f\"{stat} -> T-stat: {t_stat:.2f}, P-val: {p_val:.4f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3fb884d6-740d-437d-9369-ab11b2b84fbb", + "metadata": {}, + "source": [ + "H0 (null hypothesis): Legendary and Non-Legendary Pokémon have the same mean for the stat.\n", + "\n", + "H1 (alternative hypothesis): Legendary and Non-Legendary Pokémon have different means for the stat.\n", + "\n", + "For all six stats the printed p-value rounds to 0.0000 (p < 0.0001), far below 0.05.\n", + "\n", + "Decision rule:\n", + "\n", + "p < 0.05 → Reject H0\n", + "\n", + "p ≥ 0.05 → Fail to reject H0\n", + "\n", + "Every p-value is below 0.0001 → reject H0 for all six stats, even against a Bonferroni-corrected threshold of 0.05 / 6 ≈ 0.0083." + ] + }, + { + "cell_type": "markdown", + "id": "0df1d471-57dc-48f1-b109-4b6cfa33c18f", + "metadata": {}, + "source": [ + "## Challenge 2: California Housing" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cc91f005-335e-4b7f-a758-4165af32e4fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',\n", + " 'total_bedrooms', 'population', 'households', 'median_income',\n", + " 'median_house_value'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import scipy.stats as st\n", + "\n", + "# Load dataset\n", + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "\n", + "# Check columns\n", + "print(df2.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cc501fd8-e918-49cb-a82e-b361d9cc7b04", + "metadata": {}, + "outputs": [], + "source": [ + "# Coordinates as (longitude, latitude)\n", + "school = (-118, 34)\n", + 
"hospital = (-122, 37)\n", + "\n", + "# Straight-line (Euclidean) distance in raw longitude/latitude units (degrees), not kilometres\n", + "def distance(x1, y1, x2, y2):\n", + "    return np.sqrt((x1-x2)**2 + (y1-y2)**2)\n", + "\n", + "# Distances\n", + "df2[\"dist_school\"] = distance(df2[\"longitude\"], df2[\"latitude\"], school[0], school[1])\n", + "df2[\"dist_hospital\"] = distance(df2[\"longitude\"], df2[\"latitude\"], hospital[0], hospital[1])\n", + "\n", + "# Close = within 0.5 distance units of either the school or the hospital\n", + "df2[\"close\"] = np.where((df2[\"dist_school\"] < 0.5) | (df2[\"dist_hospital\"] < 0.5), 1, 0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3eed33a7-5a4c-4809-94b3-70557921e8cb", + "metadata": {}, + "outputs": [], + "source": [ + "# split dataset into close and far houses\n", + "close_houses = df2[df2[\"close\"] == 1][\"median_house_value\"]\n", + "far_houses = df2[df2[\"close\"] == 0][\"median_house_value\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c791094e-c324-4dd2-862c-eeda8cf1e07a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 37.992330214201516\n", + "One-sided P-value: 1.5032478884296307e-301\n" + ] + } + ], + "source": [ + "# Welch t-test for H1: mean value of close houses > mean value of far houses\n", + "t_stat, p_val_two_sided = st.ttest_ind(close_houses, far_houses, equal_var=False)\n", + "\n", + "# One-sided p-value derived from the two-sided result\n", + "if t_stat > 0:\n", + "    p_val_one_sided = p_val_two_sided / 2\n", + "else:\n", + "    p_val_one_sided = 1 - p_val_two_sided / 2  # statistic points away from H1\n", + "\n", + "print(\"T-statistic:\", t_stat)\n", + "print(\"One-sided P-value:\", p_val_one_sided)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fd8ee5ad-3ccc-442a-b843-9aa5ed041b14", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reject H0: Houses close to a school or hospital are significantly more expensive.\n" + ] + } + ], + "source": [ + "# interpretation of the result\n", 
+ "\n", + "if p_val_one_sided < 0.05:\n", + " print(\"Reject H0: Houses close to a school or hospital are significantly more expensive.\")\n", + "else:\n", + " print(\"Fail to reject H0: No significant evidence that proximity increases house value.\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "d7d56e96-0e3b-4add-9c08-e1d5dc965fcc", + "metadata": {}, + "source": [ + "I reject the null hypothesis at the 5% significance level. There is strong statistical evidence that houses located near a school or hospital (within 0.5 degrees of longitude/latitude) are, on average, significantly more expensive than those farther away." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2a685fc-11ce-4ffa-b28e-ae33e1e4a4b7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}