# %%
# Load the Pokémon dataset used by the first two hypothesis tests.
import pandas as pd

df_pokemon = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv")
df_pokemon.head()

# %%
# One-sided two-sample t-test on HP.
#   H0: mean HP of Dragon-type Pokémon <= mean HP of Grass-type Pokémon
#   H1: mean HP of Dragon-type Pokémon >  mean HP of Grass-type Pokémon
# Groups are defined by the primary type column ('Type 1') only.
from scipy.stats import ttest_ind

alpha = 0.05

# 1. Prepare data groups (.loc with a boolean mask avoids chained indexing).
hp_dragon = df_pokemon.loc[df_pokemon['Type 1'] == 'Dragon', 'HP']
hp_grass = df_pokemon.loc[df_pokemon['Type 1'] == 'Grass', 'HP']

# 2. Run the right-sided test directly.
# alternative='greater' (SciPy >= 1.6) returns the one-tailed p-value, so the
# two-tailed result no longer has to be halved / mirrored by hand.
# equal_var is left at its default (True), i.e. a pooled-variance Student test;
# if the two groups' variances or sizes differ markedly, rerun with
# equal_var=False (Welch's test) as a sensitivity check.
t_stat, p_value_one_tailed = ttest_ind(
    hp_dragon, hp_grass, alternative='greater', nan_policy='omit'
)

print("--- T-Test: Dragon HP vs. Grass HP (Unilateral) ---")
print(f"Mean HP Dragon: {hp_dragon.mean():.2f}")
print(f"Mean HP Grass: {hp_grass.mean():.2f}")
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value (One-Tailed): {p_value_one_tailed:.4f}")

# 3. Decision. The messages interpolate `alpha` so they stay correct if the
# significance level above is changed.
if p_value_one_tailed < alpha:
    print(f"\nFinding: Since the p-value ({p_value_one_tailed:.4f}) is less than alpha ({alpha}), we REJECT the Null Hypothesis (H₀).")
    print("Conclusion: There is statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.")
else:
    print(f"\nFinding: Since the p-value ({p_value_one_tailed:.4f}) is greater than alpha ({alpha}), we FAIL TO REJECT the Null Hypothesis (H₀).")
    print("Conclusion: There is no statistically significant evidence to support the posit that Dragon-type Pokemons have, on average, more HP than Grass-type Pokemons.")
# %%
# Two-sided two-sample t-tests: do Legendary and Non-Legendary Pokémon differ
# on each base stat?
#   H0: the two group means are equal      H1: the two group means differ
# Imported here so this cell does not rely on an import made inside an
# earlier analysis cell.
import pandas as pd
from scipy.stats import ttest_ind

# 1. Define stats columns and the significance level (set once, not per loop pass).
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
alpha = 0.05

# 2. Separate data groups.
legendary_stats = df_pokemon.loc[df_pokemon['Legendary'].eq(True)]
non_legendary_stats = df_pokemon.loc[df_pokemon['Legendary'].eq(False)]

results = {}

print("--- T-Test: Legendary vs. Non-Legendary Stats (Bilateral) ---")

# 3. Perform one t-test per stat.
# NOTE(review): six tests are run at the same alpha with no multiple-comparison
# correction; consider comparing against alpha / len(stat_columns) (Bonferroni).
for stat in stat_columns:
    legendary_group = legendary_stats[stat]
    non_legendary_group = non_legendary_stats[stat]

    t_stat, p_value = ttest_ind(legendary_group, non_legendary_group, nan_policy='omit')

    results[stat] = {
        'T-Stat': t_stat,
        'P-Value': p_value,
        'Legendary Mean': legendary_group.mean(),
        'Non-Legendary Mean': non_legendary_group.mean(),
    }

    if p_value < alpha:
        finding = "Reject H₀"
        conclusion = "Significant difference found"
    else:
        finding = "Fail to Reject H₀"
        conclusion = "No significant difference found"

    print(f"\nStatistic: {stat}")
    # Scientific notation: fixed-point formatting rounds very small p-values
    # to "0.000000", which hides their magnitude.
    print(f"  P-Value: {p_value:.3e}")
    print(f"  Finding: {finding}")
    print(f"  Conclusion: {conclusion}")

# 4. Summary table: one row per stat.
results_df = pd.DataFrame.from_dict(results, orient='index')
print("\nDetailed Results:")
print(results_df[['Legendary Mean', 'Non-Legendary Mean', 'T-Stat', 'P-Value']])

# %%
# Load the California housing dataset used by the remaining cells.
df_housing = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv")
df_housing.head()
# %%
# Flag every row of df_housing that lies within DISTANCE_THRESHOLD of either
# reference point. Reference points are stored as (longitude, latitude) pairs.
SCHOOL_COORD = (-118, 34)
HOSPITAL_COORD = (-122, 37)
DISTANCE_THRESHOLD = 0.50


def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the planar Euclidean distance between two (lat, lon) points.

    The inputs are combined with plain arithmetic and ``np.sqrt``, so scalars
    and array-likes (e.g. pandas Series) are both accepted; the result is in
    the same units as the inputs.
    """
    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1
    return np.sqrt(delta_lon**2 + delta_lat**2)


# Unpack once so the (lon, lat) storage order is handled in a single place.
school_lon, school_lat = SCHOOL_COORD
hospital_lon, hospital_lat = HOSPITAL_COORD

house_lat = df_housing['latitude']
house_lon = df_housing['longitude']

# Distance from every row to each reference point.
df_housing['dist_to_school'] = euclidean_distance(house_lat, house_lon, school_lat, school_lon)
df_housing['dist_to_hospital'] = euclidean_distance(house_lat, house_lon, hospital_lat, hospital_lon)

# A row counts as "close" when at least one of the two distances is within the threshold.
near_school = df_housing['dist_to_school'].le(DISTANCE_THRESHOLD)
near_hospital = df_housing['dist_to_hospital'].le(DISTANCE_THRESHOLD)
df_housing['is_close'] = near_school | near_hospital

print("\nProximity variable created.")
print(df_housing['is_close'].value_counts())
# %%
# One-sided two-sample t-test on median house value.
#   H0: mean value of "close" rows <= mean value of "far" rows
#   H1: mean value of "close" rows >  mean value of "far" rows
# Relies on the boolean 'is_close' column already present on df_housing.
# Imported here so this cell does not rely on an import made in another cell.
from scipy.stats import ttest_ind

alpha = 0.05

# 1. Prepare data groups.
close_houses = df_housing.loc[df_housing['is_close'].eq(True), 'median_house_value']
far_houses = df_housing.loc[df_housing['is_close'].eq(False), 'median_house_value']

# 2. Run the right-sided test directly.
# alternative='greater' (SciPy >= 1.6) yields the one-tailed p-value, replacing
# the manual halving / mirroring of the two-tailed result.
# equal_var stays at its default (pooled variance); rerun with equal_var=False
# (Welch's test) if the two groups' variances differ markedly.
t_stat, p_value_one_tailed = ttest_ind(
    close_houses, far_houses, alternative='greater', nan_policy='omit'
)

print("\n--- T-Test: Close Houses vs. Far Houses (Unilateral) ---")
print(f"Mean House Value (Close): ${close_houses.mean():,.2f}")
print(f"Mean House Value (Far): ${far_houses.mean():,.2f}")
print(f"T-Statistic: {t_stat:.4f}")
# Scientific notation: fixed-point formatting rounds very small p-values to
# "0.000000", which hides their magnitude.
print(f"P-Value (One-Tailed): {p_value_one_tailed:.3e}")

# 3. Decision. The messages interpolate `alpha` so they stay correct if the
# significance level above is changed.
if p_value_one_tailed < alpha:
    print(f"\nFinding: Since the p-value ({p_value_one_tailed:.3e}) is less than alpha ({alpha}), we REJECT the Null Hypothesis (H₀).")
    print("Conclusion: There is statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.")
else:
    print(f"\nFinding: Since the p-value ({p_value_one_tailed:.3e}) is greater than alpha ({alpha}), we FAIL TO REJECT the Null Hypothesis (H₀).")
    print("Conclusion: There is no statistically significant evidence to support the posit that houses (neighborhoods) close to a school or a hospital are, on average, more expensive.")