diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..804a3a4 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -38,15 +38,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", - "import numpy as np\n", - "\n" + "import numpy as np" ] }, { @@ -297,11 +296,73 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "#code here" + "# H0 : The average HP of Dragon = average of Grass\n", + "# H1 : The average HP of Dragon > average of Grass\n", + "\n", + "# one-sided two-sample t-test" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "69.25875" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"HP\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t-statistic: 3.3349632905124063\n", + "p-value (bilateral): 0.0015987219490841195\n", + "✅ H0 rejected: Dragons have significantly more HP than Grass (5%).\n" + ] + } + ], + "source": [ + "t_stat, p_val = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "print(\"t-statistic:\", t_stat)\n", + "print(\"p-value (bilateral):\", p_val)\n", + "\n", + "# One-sided hypothesis (Dragon > Grass):\n", + "# we halve the two-sided p-value and check that the Dragon mean is greater than the Grass mean\n", + "if (dragon_hp.mean() > grass_hp.mean()) and (p_val/2 < 0.05):\n", + " print(\"✅ H0 rejected: Dragons 
have significantly more HP than Grass (5%).\")\n", + "else:\n", + " print(\"❌ H0 cannot be rejected at the 5% threshold.\")" ] }, { @@ -313,11 +374,53 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#code here" + "# H0 : Legendaries and non-legendaries have the same stats\n", + "# H1 : Legendaries and non-legendaries do not have the same stats" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP | mean Legendary=92.7, mean NonLegendary=67.2, F=64.58, p=0.0000\n", + "Attack | mean Legendary=116.7, mean NonLegendary=75.7, F=108.10, p=0.0000\n", + "Defense | mean Legendary=99.7, mean NonLegendary=71.6, F=51.57, p=0.0000\n", + "Sp. Atk | mean Legendary=122.2, mean NonLegendary=68.5, F=201.40, p=0.0000\n", + "Sp. Def | mean Legendary=105.9, mean NonLegendary=68.9, F=121.83, p=0.0000\n", + "Speed | mean Legendary=100.2, mean NonLegendary=65.5, F=95.36, p=0.0000\n" + ] + } + ], + "source": [ + "legendary = df[df[\"Legendary\"] == True]\n", + "nonlegend = df[df[\"Legendary\"] == False]\n", + "\n", + "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n", + "\n", + "for s in stats:\n", + " f, p = st.f_oneway(legendary[s], nonlegend[s])\n", + " print(f\"{s:7s} | mean Legendary={legendary[s].mean():.1f}, mean NonLegendary={nonlegend[s].mean():.1f}, F={f:.2f}, p={p:.4f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We reject H0\n", + "\n", + "# This means that Legendary Pokémon have significantly different (and on average higher) stats than non-Legendaries in all dimensions tested \n", + "# (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed)." 
] }, { @@ -337,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -453,7 +556,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -486,19 +589,65 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# H0 : The average price of nearby houses (≤ 0.5) is no different from that of distant houses (> 0.5).\n", + "# H1 : The average price of nearby houses is higher than that of distant houses." + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Close mean: 246934.639238653\n", + "Far mean: 180683.57168141592\n", + "t-stat: 37.97959304116918\n", + "p-value (bilateral): 4.642199519694926e-301\n", + "✅ H0 rejected : nearby houses are significantly more expensive (5%).\n" + ] + } + ], + "source": [ + "school = (-118, 34)\n", + "hospital = (-122, 37)\n", + "\n", + "def euclidean_distance(x1, y1, x2, y2):\n", + " return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)\n", + "\n", + "# Distance\n", + "df[\"dist_school\"] = df.apply(lambda row: euclidean_distance(row[\"longitude\"], row[\"latitude\"], *school), axis=1)\n", + "df[\"dist_hospital\"] = df.apply(lambda row: euclidean_distance(row[\"longitude\"], row[\"latitude\"], *hospital), axis=1)\n", + "\n", + "# Indicator variable \"close\"\n", + "df[\"close\"] = ((df[\"dist_school\"] <= 0.5) | (df[\"dist_hospital\"] <= 0.5))\n", + "\n", + "# Separate groups\n", + "close_prices = df[df[\"close\"] == True][\"median_house_value\"]\n", + "far_prices = df[df[\"close\"] == False][\"median_house_value\"]\n", + "\n", + "# T Test\n", + "t_stat, p_val = st.ttest_ind(close_prices, far_prices, equal_var=False)\n", + "\n", + "print(\"Close mean:\", close_prices.mean())\n", + "print(\"Far mean:\", 
far_prices.mean())\n", + "print(\"t-stat:\", t_stat)\n", + "print(\"p-value (bilateral):\", p_val)\n", + "\n", + "# Unilateral Test (close > far)\n", + "if (close_prices.mean() > far_prices.mean()) and (p_val/2 < 0.05):\n", + " print(\"✅ H0 rejected : nearby houses are significantly more expensive (5%).\")\n", + "else:\n", + " print(\"❌ H0 cannot be rejected at the 5% threshold.\")\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -512,7 +661,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.2" } }, "nbformat": 4,