Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 253 additions & 22 deletions lab-hypothesis-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -278,14 +278,14 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n",
"df"
"# Pokemon stats table used by the t-test cells below, read directly from GitHub\n",
"POKEMON_CSV_URL = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\"\n",
"df_pokemon = pd.read_csv(POKEMON_CSV_URL)\n",
"df_pokemon"
]
},
{
Expand All @@ -297,11 +297,58 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- T-Test: Dragon HP vs. Grass HP (Unilateral) ---\n",
"Mean HP Dragon: 83.31\n",
"Mean HP Grass: 67.27\n",
"T-Statistic: 3.5904\n",
"P-Value (One-Tailed): 0.0003\n",
"\n",
"Finding: Since the p-value (0.0003) is less than alpha (0.05), we REJECT the Null Hypothesis (H₀).\n",
"Conclusion: There is statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.\n"
]
}
],
"source": [
"#code here"
"from scipy.stats import ttest_ind\n",
"\n",
"# Right-tailed two-sample t-test on HP, grouping rows by the 'Type 1' column.\n",
"#   H0: mu_Dragon <= mu_Grass\n",
"#   Ha: mu_Dragon >  mu_Grass\n",
"alpha = 0.05\n",
"\n",
"# 1. Prepare data groups\n",
"hp_dragon = df_pokemon.loc[df_pokemon['Type 1'] == 'Dragon', 'HP']\n",
"hp_grass = df_pokemon.loc[df_pokemon['Type 1'] == 'Grass', 'HP']\n",
"\n",
"# 2. Run the test.\n",
"# alternative='greater' makes scipy return the right-tailed p-value directly, so the\n",
"# two-tailed result no longer has to be halved by hand depending on the sign of t.\n",
"# equal_var is left at its default (True, pooled variance), exactly as before; pass\n",
"# equal_var=False to run Welch's test instead if the group variances differ.\n",
"t_stat, p_value_one_tailed = ttest_ind(\n",
"    hp_dragon, hp_grass, nan_policy='omit', alternative='greater'\n",
")\n",
"\n",
"print(\"--- T-Test: Dragon HP vs. Grass HP (Unilateral) ---\")\n",
"print(f\"Mean HP Dragon: {hp_dragon.mean():.2f}\")\n",
"print(f\"Mean HP Grass: {hp_grass.mean():.2f}\")\n",
"print(f\"T-Statistic: {t_stat:.4f}\")\n",
"print(f\"P-Value (One-Tailed): {p_value_one_tailed:.4f}\")\n",
"\n",
"# Commentary (the message reports the alpha actually used rather than a hard-coded 0.05):\n",
"if p_value_one_tailed < alpha:\n",
"    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.4f}) is less than alpha ({alpha}), we REJECT the Null Hypothesis (H₀).\")\n",
"    print(\"Conclusion: There is statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.\")\n",
"else:\n",
"    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.4f}) is greater than alpha ({alpha}), we FAIL TO REJECT the Null Hypothesis (H₀).\")\n",
"    print(\"Conclusion: There is no statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.\")"
]
},
{
Expand All @@ -313,11 +360,103 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- T-Test: Legendary vs. Non-Legendary Stats (Bilateral) ---\n",
"\n",
"Statistic: HP\n",
" P-Value: 0.000000\n",
" Finding: Reject H₀\n",
" Conclusion: Significant difference found\n",
"\n",
"Statistic: Attack\n",
" P-Value: 0.000000\n",
" Finding: Reject H₀\n",
" Conclusion: Significant difference found\n",
"\n",
"Statistic: Defense\n",
" P-Value: 0.000000\n",
" Finding: Reject H₀\n",
" Conclusion: Significant difference found\n",
"\n",
"Statistic: Sp. Atk\n",
" P-Value: 0.000000\n",
" Finding: Reject H₀\n",
" Conclusion: Significant difference found\n",
"\n",
"Statistic: Sp. Def\n",
" P-Value: 0.000000\n",
" Finding: Reject H₀\n",
" Conclusion: Significant difference found\n",
"\n",
"Statistic: Speed\n",
" P-Value: 0.000000\n",
" Finding: Reject H₀\n",
" Conclusion: Significant difference found\n",
"\n",
"Detailed Results:\n",
" Legendary Mean Non-Legendary Mean T-Stat P-Value\n",
"HP 92.738462 67.182313 8.036124 3.330648e-15\n",
"Attack 116.676923 75.669388 10.397321 7.827253e-24\n",
"Defense 99.661538 71.559184 7.181240 1.584223e-12\n",
"Sp. Atk 122.184615 68.454422 14.191406 6.314916e-41\n",
"Sp. Def 105.938462 68.892517 11.037751 1.843981e-26\n",
"Speed 100.184615 65.455782 9.765234 2.354075e-21\n"
]
}
],
"source": [
"#code here"
"# Two-tailed t-test of Legendary vs. non-Legendary rows, run once per stat column.\n",
"# Note: the six tests are each judged at alpha; no multiple-comparison correction is applied.\n",
"stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
"alpha = 0.05  # significance level, set once instead of on every loop pass\n",
"\n",
"# Split the rows on the value of the 'Legendary' column\n",
"legendary_stats = df_pokemon[df_pokemon['Legendary'] == True]\n",
"non_legendary_stats = df_pokemon[df_pokemon['Legendary'] == False]\n",
"\n",
"# Per-stat results, keyed by stat name; turned into a summary table at the end\n",
"results = {}\n",
"\n",
"print(\"--- T-Test: Legendary vs. Non-Legendary Stats (Bilateral) ---\")\n",
"\n",
"for stat in stat_columns:\n",
"    legendary_group = legendary_stats[stat]\n",
"    non_legendary_group = non_legendary_stats[stat]\n",
"\n",
"    # Two-sided test with scipy's default pooled-variance setting\n",
"    t_stat, p_value = ttest_ind(legendary_group, non_legendary_group, nan_policy='omit')\n",
"\n",
"    results[stat] = {\n",
"        'T-Stat': t_stat,\n",
"        'P-Value': p_value,\n",
"        'Legendary Mean': legendary_group.mean(),\n",
"        'Non-Legendary Mean': non_legendary_group.mean()\n",
"    }\n",
"\n",
"    # Commentary\n",
"    if p_value < alpha:\n",
"        finding, conclusion = \"Reject H₀\", \"Significant difference found\"\n",
"    else:\n",
"        finding, conclusion = \"Fail to Reject H₀\", \"No significant difference found\"\n",
"\n",
"    print(f\"\\nStatistic: {stat}\")\n",
"    print(f\" P-Value: {p_value:.6f}\")\n",
"    print(f\" Finding: {finding}\")\n",
"    print(f\" Conclusion: {conclusion}\")\n",
"\n",
"# Summary table: one row per stat; it shows the p-values that the 6-decimal print rounds to zero\n",
"results_df = pd.DataFrame(results).T\n",
"print(\"\\nDetailed Results:\")\n",
"print(results_df[['Legendary Mean', 'Non-Legendary Mean', 'T-Stat', 'P-Value']])"
]
},
{
Expand All @@ -337,7 +476,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -453,14 +592,14 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
"df.head()"
"# California housing table used by the proximity analysis below, read directly from GitHub\n",
"HOUSING_CSV_URL = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\"\n",
"df_housing = pd.read_csv(HOUSING_CSV_URL)\n",
"df_housing.head()"
]
},
{
Expand All @@ -483,10 +622,102 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Proximity variable created.\n",
"is_close\n",
"False 10170\n",
"True 6830\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Reference points, stored as (longitude, latitude) pairs\n",
"SCHOOL_COORD = (-118, 34)\n",
"HOSPITAL_COORD = (-122, 37)\n",
"# Largest distance, in the same units as the coordinates, that still counts as \"close\"\n",
"DISTANCE_THRESHOLD = 0.50\n",
"\n",
"\n",
"def euclidean_distance(lat1, lon1, lat2, lon2):\n",
"    \"\"\"Return the straight-line distance between (lat1, lon1) and (lat2, lon2).\n",
"\n",
"    The coordinates are treated as plain Cartesian values: the result is\n",
"    sqrt(d_lon**2 + d_lat**2) in the same units as the inputs, with no\n",
"    spherical correction. The arithmetic is element-wise, so pandas Series\n",
"    can be passed for the first point and scalars for the second.\n",
"    \"\"\"\n",
"    delta_lon = lon2 - lon1\n",
"    delta_lat = lat2 - lat1\n",
"    return np.sqrt(delta_lon**2 + delta_lat**2)\n",
"\n",
"# 2. Distance from every row to each reference point.\n",
"# The *_COORD tuples are (longitude, latitude); unpack them by name so the\n",
"# (lat, lon) argument order of euclidean_distance is easy to check.\n",
"school_lon, school_lat = SCHOOL_COORD\n",
"hospital_lon, hospital_lat = HOSPITAL_COORD\n",
"\n",
"df_housing['dist_to_school'] = euclidean_distance(\n",
"    df_housing['latitude'], df_housing['longitude'], school_lat, school_lon\n",
")\n",
"df_housing['dist_to_hospital'] = euclidean_distance(\n",
"    df_housing['latitude'], df_housing['longitude'], hospital_lat, hospital_lon\n",
")\n",
"\n",
"# 3. Flag rows that lie within DISTANCE_THRESHOLD of the school OR the hospital\n",
"near_school = df_housing['dist_to_school'] <= DISTANCE_THRESHOLD\n",
"near_hospital = df_housing['dist_to_hospital'] <= DISTANCE_THRESHOLD\n",
"df_housing['is_close'] = near_school | near_hospital\n",
"\n",
"print(\"\\nProximity variable created.\")\n",
"print(df_housing['is_close'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"--- T-Test: Close Houses vs. Far Houses (Unilateral) ---\n",
"Mean House Value (Close): $246,934.64\n",
"Mean House Value (Far): $180,683.57\n",
"T-Statistic: 38.0333\n",
"P-Value (One-Tailed): 0.000000\n",
"\n",
"Finding: Since the p-value (0.000000) is less than alpha (0.05), we REJECT the Null Hypothesis (H₀).\n",
"Conclusion: There is statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.\n"
]
}
],
"source": [
"# Right-tailed two-sample t-test on median_house_value, grouping rows by 'is_close'.\n",
"#   H0: mu_Close <= mu_Far\n",
"#   Ha: mu_Close >  mu_Far\n",
"alpha = 0.05\n",
"\n",
"# 1. Prepare data groups\n",
"close_houses = df_housing.loc[df_housing['is_close'] == True, 'median_house_value']\n",
"far_houses = df_housing.loc[df_housing['is_close'] == False, 'median_house_value']\n",
"\n",
"# 2. Run the test.\n",
"# alternative='greater' makes scipy return the right-tailed p-value directly, so the\n",
"# two-tailed result no longer has to be halved by hand depending on the sign of t.\n",
"# equal_var is left at its default (True, pooled variance), exactly as before.\n",
"t_stat, p_value_one_tailed = ttest_ind(\n",
"    close_houses, far_houses, nan_policy='omit', alternative='greater'\n",
")\n",
"\n",
"print(\"\\n--- T-Test: Close Houses vs. Far Houses (Unilateral) ---\")\n",
"print(f\"Mean House Value (Close): ${close_houses.mean():,.2f}\")\n",
"print(f\"Mean House Value (Far): ${far_houses.mean():,.2f}\")\n",
"print(f\"T-Statistic: {t_stat:.4f}\")\n",
"print(f\"P-Value (One-Tailed): {p_value_one_tailed:.6f}\")\n",
"\n",
"# Commentary (the message reports the alpha actually used rather than a hard-coded 0.05):\n",
"if p_value_one_tailed < alpha:\n",
"    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.6f}) is less than alpha ({alpha}), we REJECT the Null Hypothesis (H₀).\")\n",
"    print(\"Conclusion: There is statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.\")\n",
"else:\n",
"    print(f\"\\nFinding: Since the p-value ({p_value_one_tailed:.6f}) is greater than alpha ({alpha}), we FAIL TO REJECT the Null Hypothesis (H₀).\")\n",
"    print(\"Conclusion: There is no statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.\")"
]
},
{
"cell_type": "code",
Expand All @@ -498,9 +729,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [conda env:base] *",
"language": "python",
"name": "python3"
"name": "conda-base-py"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -512,9 +743,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}