From 0fab579a2bc24aa649c82c1149ec72d2e5faa15f Mon Sep 17 00:00:00 2001 From: elbgross Date: Sun, 23 Nov 2025 19:39:17 +0100 Subject: [PATCH] solved lab --- lab-hypothesis-testing.ipynb | 154 +++++++++++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 14 deletions(-) diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..092160e 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +297,56 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "#code here" + "#In the first place I create a serie with the data of HP per type of pokemon\n", + "\n", + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since we are comparing the means of two independent populations, I will use a two-sample t-test (Welch's version, which does not assume equal variances).\n", + "\n", + "#### Hypothesis:\n", + "\n", + "H0: mu_hp dragon = mu_hp grass\n", + "\n", + "H1: mu_hp dragon != mu_hp grass\n", + "\n", + "*significance level = 0.05*" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=np.float64(3.3349632905124063), pvalue=np.float64(0.0015987219490841197), df=np.float64(50.83784116232685))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "st.ttest_ind(dragon_hp,grass_hp, equal_var=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The p-value is smaller than 0.05, so there is a significant difference between the two means of 
each group. So we reject the null hypothesis." ] }, { @@ -313,11 +358,40 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "#code here" + "cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n", + "\n", + "legendary_pok = df[df['Legendary'] == True][cols]\n", + "non_legendary_pok = df[df['Legendary'] == False][cols]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t = [ 8.03612441 10.39732102 7.18124012 14.19140621 11.03775106 9.76523433] p = [3.33064768e-15 7.82725300e-24 1.58422261e-12 6.31491577e-41\n", + " 1.84398096e-26 2.35407544e-21]\n" + ] + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(legendary_pok, non_legendary_pok)\n", + "print(\"t =\", t_stat, \"p =\", p_value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When we compare each stat between the two groups, every p-value is far smaller than 0.05. 
We can say that the stats are significantly different between legendary and non-legendary.\n" ] }, { @@ -337,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -453,7 +527,7 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -483,17 +557,69 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import numpy as np\n", + "\n", + "\n", + "def euclidean_distance(x1, y1, x2, y2):\n", + " return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)\n", + "\n", + "\n", + "df[\"dist_school\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], -118, 34)\n", + "df[\"dist_hospital\"] = euclidean_distance(df[\"longitude\"], df[\"latitude\"], -122, 37)\n", + "\n", + "df[\"dist_closest\"] = df[\"dist_school\"]\n", + "df.loc[df[\"dist_hospital\"] < df[\"dist_school\"], \"dist_closest\"] = df[\"dist_hospital\"]\n", + "\n", + "close = df[df[\"dist_closest\"] < 0.5]\n", + "far = df[df[\"dist_closest\"] >= 0.5]\n", + "\n", + "close_prices = close[\"median_house_value\"]\n", + "far_prices = far[\"median_house_value\"]\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The hypotheses that we set are:\n", + "\n", + "H0: mean house price close = mean house price far \n", + "\n", + "H1: mean house price close != mean house price far" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "TtestResult(statistic=np.float64(37.992330214201516), pvalue=np.float64(3.0064957768592614e-301), df=np.float64(14571.229910954282))" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "st.ttest_ind(close_prices,far_prices, equal_var=False)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "The p value is extremely low so we can say that is a huge diference of price if the house is close to a hospital or a school." + ] } ], "metadata": { @@ -512,7 +638,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.6" } }, "nbformat": 4,