From 32139c589c19194c602c150518aafd209ab00693 Mon Sep 17 00:00:00 2001 From: mariamnez Date: Mon, 25 Aug 2025 14:14:50 +0200 Subject: [PATCH] Add files via upload --- lab-hypothesis-testing-SOLVED.ipynb | 616 ++++++++++++++++++++++++++++ 1 file changed, 616 insertions(+) create mode 100644 lab-hypothesis-testing-SOLVED.ipynb diff --git a/lab-hypothesis-testing-SOLVED.ipynb b/lab-hypothesis-testing-SOLVED.ipynb new file mode 100644 index 0000000..35693d5 --- /dev/null +++ b/lab-hypothesis-testing-SOLVED.ipynb @@ -0,0 +1,616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab | Hypothesis Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Objective**\n", + "\n", + "Welcome to the Hypothesis Testing Lab, where we embark on an enlightening journey through the realm of statistical decision-making! In this laboratory, we delve into various scenarios, applying the powerful tools of hypothesis testing to scrutinize and interpret data.\n", + "\n", + "From testing the mean of a single sample (One Sample T-Test), to investigating differences between independent groups (Two Sample T-Test), and exploring relationships within dependent samples (Paired Sample T-Test), our exploration knows no bounds. Furthermore, we'll venture into the realm of Analysis of Variance (ANOVA), unraveling the complexities of comparing means across multiple groups.\n", + "\n", + "So, grab your statistical tools, prepare your hypotheses, and let's embark on this fascinating journey of exploration and discovery in the world of hypothesis testing!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 1**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with pokemon data. 
The data can be found here:\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#libraries\n", + "import pandas as pd\n", + "import scipy.stats as st\n", + "import numpy as np\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " Speed Generation Legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load Pokémon into its own variable to avoid collisions with later datasets\n", + "df_poke = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n", + "df_poke.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." 
# Challenge 1a — "Dragon-type Pokémon have, on average, more HP than Grass-type."
#   H0: mu_Dragon <= mu_Grass      H1: mu_Dragon > mu_Grass   (one-sided, alpha = 0.05)
#
# FIX: the original cell compared Dragons against *all* other Pokémon, which is
# not the hypothesis stated in the prompt; here Dragon is compared with Grass.
# Groups are built from the primary type ('Type 1') so the two samples are
# disjoint and independent.
dragon_hp = df_poke.loc[df_poke['Type 1'] == 'Dragon', 'HP'].dropna()
grass_hp = df_poke.loc[df_poke['Type 1'] == 'Grass', 'HP'].dropna()

# Welch's t-test: independent samples, no equal-variance assumption.
# (SciPy >= 1.6 could pass alternative='greater' directly; the manual halving
# below keeps the cell compatible with older SciPy versions.)
t_stat, p_two_sided = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)

# Convert the two-sided p-value to one-sided for H1: Dragon > Grass.
p_one_sided = p_two_sided / 2 if t_stat > 0 else 1 - p_two_sided / 2

alpha = 0.05
print(f"n_dragon={len(dragon_hp)}, n_grass={len(grass_hp)}")
print(f"mean_dragon_HP={dragon_hp.mean():.2f}, mean_grass_HP={grass_hp.mean():.2f}")
print(f"Welch t={t_stat:.3f}, one-sided p={p_one_sided:.4g}")
print("Decision (α=0.05):",
      "Reject H0 (Dragons have higher HP than Grass)" if p_one_sided < alpha
      else "Fail to reject H0")
# Hypothesis: Legendary Pokémon have different stats than non-Legendary ones.
# Two-sided Welch t-tests per stat, with Bonferroni-adjusted p-values to
# account for the multiple comparisons.

# Locate the Legendary flag column regardless of its exact spelling
# (e.g. 'Legendary', 'is_legendary', 'Legendary?').
def _norm(name):
    for ch in (" ", ".", "_", "?"):
        name = name.replace(ch, "")
    return name.lower()

_matches = [c for c in df_poke.columns if _norm(c) in ("legendary", "islegendary")]
if not _matches:
    raise KeyError("Legendary / is_legendary column not found in Pokémon dataset.")
legend_col = _matches[0]

# Coerce to a boolean mask when the column is not already boolean.
is_legendary = df_poke[legend_col]
if is_legendary.dtype != bool:
    is_legendary = is_legendary.astype(str).str.lower().isin(['true','1','yes','t','legendary'])

# Keep only the stat columns that actually exist in this frame.
stats_cols = [c for c in ['HP','Attack','Defense','Sp. Atk','Sp. Def','Speed'] if c in df_poke.columns]

records = []
for stat_name in stats_cols:
    leg = df_poke.loc[is_legendary, stat_name].dropna()
    non = df_poke.loc[~is_legendary, stat_name].dropna()
    t_val, p_val = st.ttest_ind(leg, non, equal_var=False, nan_policy='omit')
    records.append({
        'stat': stat_name,
        'n_legendary': len(leg),
        'n_nonlegendary': len(non),
        'mean_legendary': float(leg.mean()) if len(leg) else np.nan,
        'mean_nonlegendary': float(non.mean()) if len(non) else np.nan,
        't': float(t_val),
        'p_two_sided': float(p_val),
    })

res_df = pd.DataFrame(records)
# Bonferroni: multiply each raw p-value by the number of tests, capped at 1.
res_df['p_bonferroni'] = (res_df['p_two_sided'] * len(res_df)).clip(upper=1.0)
res_df = res_df.sort_values('p_two_sided').reset_index(drop=True)
res_df
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 34)\n", + "- Hospital coordinates (-122, 37)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute Euclidean distances to the given school and hospital, then flag 'near' vs 'far'\n", + "import numpy as np\n", + "\n", + "def euclidean_distance(lon, lat, lon0, lat0):\n", + " return np.sqrt((lon - lon0)**2 + (lat - lat0)**2)\n", + "\n", + "school = (-118.0, 34.0)\n", + "hospital = (-122.0, 37.0)\n", + "\n", + "df['dist_school'] = euclidean_distance(df['longitude'], 
df['latitude'], school[0], school[1])\n", + "df['dist_hospital'] = euclidean_distance(df['longitude'], df['latitude'], hospital[0], hospital[1])\n", + "df['dist_min'] = df[['dist_school','dist_hospital']].min(axis=1)\n", + "\n", + "threshold = 0.50 # as specified\n", + "df['near'] = df['dist_min'] < threshold\n", + "\n", + "df[['longitude','latitude','dist_school','dist_hospital','dist_min','near']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothesis: Houses near (within 0.50 of a school or hospital) are more expensive.\n", + "# H0: μ_near ≤ μ_far vs H1: μ_near > μ_far (one-sided)\n", + "\n", + "near_vals = df.loc[df['near'], 'median_house_value'].dropna()\n", + "far_vals = df.loc[~df['near'], 'median_house_value'].dropna()\n", + "\n", + "t_stat, p_two_sided = st.ttest_ind(near_vals, far_vals, equal_var=False, nan_policy='omit')\n", + "p_one_sided = p_two_sided/2 if t_stat > 0 else 1 - p_two_sided/2\n", + "\n", + "alpha = 0.05\n", + "print(f\"n_near={len(near_vals)}, n_far={len(far_vals)}\")\n", + "print(f\"mean_near={near_vals.mean():.2f}, mean_far={far_vals.mean():.2f}\")\n", + "print(f\"Welch t={t_stat:.3f}, one-sided p={p_one_sided:.4g}\")\n", + "print(\"Decision (α=0.05):\", \"Reject H0 (near are pricier)\" if p_one_sided < alpha else \"Fail to reject H0\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}