diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..d9b6270 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +297,65 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "#code here" + "# H0 = there is no difference\n", + "# H1 = Dragons have higher avg HP\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reject H0: Dragon-type Pokémon have significantly higher HP than Grass-type Pokémon.\n" + ] + } + ], + "source": [ + "from scipy.stats import ttest_ind\n", + "\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']\n", + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "\n", + "# t-test\n", + "t_stat, p_value_two_tailed = ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "# Convert to one-tailed p-value (Dragon > Grass)\n", + "p_value_one_tailed = p_value_two_tailed / 2 if t_stat > 0 else 1 - (p_value_two_tailed / 2)\n", + "\n", + "t_stat, p_value_one_tailed\n", + "\n", + "if p_value_one_tailed < 0.05:\n", + " print(\"Reject H0: Dragon-type Pokémon have significantly higher HP than Grass-type Pokémon.\")\n", + "else:\n", + " print(\"Fail to reject H0: No significant evidence that Dragon-type Pokémon have higher HP.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3.3349632905124063, 0.0007993609745420598)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_stat, p_value_one_tailed" ] }, { @@ -313,11 +367,57 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "#code here" + "# H0 = No difference in stats\n", + "# H1 = Different stats" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Multivariate linear model\n", + "================================================================\n", + " \n", + "----------------------------------------------------------------\n", + " Intercept Value Num DF Den DF F Value Pr > F\n", + "----------------------------------------------------------------\n", + " Wilks' lambda 0.0592 6.0000 793.0000 2100.8338 0.0000\n", + " Pillai's trace 0.9408 6.0000 793.0000 2100.8338 0.0000\n", + " Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n", + " Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n", + "----------------------------------------------------------------\n", + " \n", + "----------------------------------------------------------------\n", + " Legendary Value Num DF Den DF F Value Pr > F\n", + "----------------------------------------------------------------\n", + " Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n", + " Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n", + " Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n", + " Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n", + "================================================================\n", + "\n" + ] + } + ], + "source": [ + "from statsmodels.multivariate.manova import MANOVA\n", + "\n", + "stats_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n", + "df_manova = df[stats_cols + ['Legendary']]\n", + "\n", + "\n", + "maov = MANOVA.from_formula('HP + Attack + Defense + Q(\"Sp. Atk\") + Q(\"Sp. Def\") + Speed ~ Legendary', data=df_manova)\n", + "result = maov.mv_test()\n", + "print(result)" ] }, { @@ -337,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -453,7 +553,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -483,22 +583,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from scipy.stats import ttest_ind\n", + "\n", + "school_coords = (-118, 34)\n", + "hospital_coords = (-122, 37)\n", + "\n", + "def euclidean_distance(lat1, lon1, lat2, lon2):\n", + " return np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Close to facility house values: [124700. 137500. 169100. ... 277700. 319400. 286100.]\n", + "Far from facility house values: [ 66900. 80100. 85700. ... 103600. 85800. 94600.]\n", + "t-statistic: 37.992, p-value: 0.000\n", + "Reject H0: Houses close to a school or hospital are significantly more expensive.\n" + ] + } + ], + "source": [ + "df['dist_school'] = euclidean_distance(df['latitude'], df['longitude'], school_coords[1], school_coords[0])\n", + "df['dist_hospital'] = euclidean_distance(df['latitude'], df['longitude'], hospital_coords[1], hospital_coords[0])\n", + "\n", + "df['close_to_facility'] = (df['dist_school'] < 0.50) | (df['dist_hospital'] < 0.50)\n", + "\n", + "close_prices = df[df['close_to_facility']]['median_house_value']\n", + "far_prices = df[~df['close_to_facility']]['median_house_value']\n", + "\n", + "t_stat, p_value_two_tailed = ttest_ind(close_prices, far_prices, equal_var=False)\n", + "p_value_one_tailed = p_value_two_tailed / 2 if t_stat > 0 else 1 - (p_value_two_tailed / 2)\n", + "\n", + "\n", + "print(\"Close to facility house values:\", close_prices.values)\n", + "print(\"Far from facility house values:\", far_prices.values)\n", + "print(f\"t-statistic: {t_stat:.3f}, p-value: {p_value_one_tailed:.3f}\")\n", + "\n", + "if p_value_one_tailed < 0.05:\n", + " print(\"Reject H0: Houses close to a school or hospital are significantly more expensive.\")\n", + "else:\n", + " print(\"Fail to reject H0: No significant evidence that proximity to a school or hospital increases house prices.\")" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -512,7 +652,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4,