# %% Challenge: do Dragon-type Pokémon have more HP, on average, than Grass-type?
#
# H0: mean HP of Dragon-type Pokémon == mean HP of Grass-type Pokémon
# H1: mean HP of Dragon-type Pokémon  > mean HP of Grass-type Pokémon
# Significance level: alpha = 0.05

# HP samples of the two primary types being compared.
# NOTE: despite its generic name, HP_Pokemons holds the Dragon-type sample.
HP_Pokemons = df.loc[df['Type 1'].eq('Dragon'), 'HP']
HP_Grass = df.loc[df['Type 1'].eq('Grass'), 'HP']

# Two-sample t-test on independent samples.
#   equal_var=False      -> Welch's variant (no equal-variance assumption)
#   alternative='greater' -> one-sided: first sample's mean > second sample's mean
_, p_value = st.ttest_ind(
    HP_Pokemons,
    HP_Grass,
    equal_var=False,
    alternative='greater',
)
p_value

# Conclusion: the recorded run gave p ≈ 0.0008 < 0.05, so H0 is rejected:
# Dragon-type Pokémon have, on average, more HP than Grass-type Pokémon.
# %% Challenge: do Legendary and non-Legendary Pokémon differ across their stats?
from statsmodels.multivariate.manova import MANOVA
from patsy import dmatrix  # NOTE(review): not used in this cell — confirm whether a later cell needs it

# H0: Legendary and non-Legendary Pokémon share the same multivariate mean
#     vector of stats (HP, Attack, Defense, Sp. Atk, Sp. Def, Speed).
# H1: the multivariate mean vectors differ, i.e. at least one stat's mean
#     is different between the two groups.

# Dependent variables of the model. Column names that are not valid Python
# identifiers ("Sp. Atk", "Sp. Def") are wrapped in Q("...") for the formula.
stat_terms = ['HP', 'Attack', 'Defense', 'Q("Sp. Atk")', 'Q("Sp. Def")', 'Speed']
manova_formula = ' + '.join(stat_terms) + ' ~ Legendary'

maov = MANOVA.from_formula(manova_formula, data=df)
print(maov.mv_test())

# Conclusion: in the recorded output every statistic for the Legendary term
# shows Pr > F = 0.0000, so H0 is rejected: the combined stats differ
# significantly between Legendary and non-Legendary Pokémon.
17000 rows × 9 columns
\n", "" ], "text/plain": [ - " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", - "0 -114.31 34.19 15.0 5612.0 1283.0 \n", - "1 -114.47 34.40 19.0 7650.0 1901.0 \n", - "2 -114.56 33.69 17.0 720.0 174.0 \n", - "3 -114.57 33.64 14.0 1501.0 337.0 \n", - "4 -114.57 33.57 20.0 1454.0 326.0 \n", + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "... ... ... ... ... ... \n", + "16995 -124.26 40.58 52.0 2217.0 394.0 \n", + "16996 -124.27 40.69 36.0 2349.0 528.0 \n", + "16997 -124.30 41.84 17.0 2677.0 531.0 \n", + "16998 -124.30 41.80 19.0 2672.0 552.0 \n", + "16999 -124.35 40.54 52.0 1820.0 300.0 \n", "\n", - " population households median_income median_house_value \n", - "0 1015.0 472.0 1.4936 66900.0 \n", - "1 1129.0 463.0 1.8200 80100.0 \n", - "2 333.0 117.0 1.6509 85700.0 \n", - "3 515.0 226.0 3.1917 73400.0 \n", - "4 624.0 262.0 1.9250 65500.0 " + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "... ... ... ... ... 
# %% Load the California housing data
df = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv")
df.head()  # preview only; the full frame has 17000 rows

# %% Reference points and proximity threshold
import math

# Both points are (longitude, latitude), matching how calculate_distance indexes them.
school_location = (-118, 34)
hospital_location = (-122, 37)

# A house counts as "close" when its distance to a reference point is below
# this value (same raw coordinate units as the longitude/latitude columns).
CLOSE_THRESHOLD = 0.50


# %% Distance helper
def calculate_distance(row, location):
    """Return the Euclidean distance between one house and a reference point.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'longitude' and 'latitude' (e.g. a DataFrame row).
    location : tuple
        Reference point as (longitude, latitude).

    Returns
    -------
    float
        Straight-line distance computed directly on the coordinate values,
        so it is expressed in coordinate units, not kilometres.
    """
    delta_lon = row['longitude'] - location[0]
    delta_lat = row['latitude'] - location[1]
    return math.sqrt(delta_lon**2 + delta_lat**2)


# %% Distance from every house to the school and to the hospital
df['dis_from_school'] = df.apply(calculate_distance, args=(school_location,), axis=1)
df['dis_from_hospital'] = df.apply(calculate_distance, args=(hospital_location,), axis=1)

# %% Label each house as close to / far from EITHER facility
# Fix: the original condition compared `dis_from_school` twice, so proximity
# to the hospital was never taken into account.
is_close = (df['dis_from_school'] < CLOSE_THRESHOLD) | (df['dis_from_hospital'] < CLOSE_THRESHOLD)
df['close_or_far'] = is_close.map({True: 'close', False: 'far'})
# %% Divide the dataset into houses close to and far from either a hospital or school
# .loc with a boolean mask selects rows and column in one step (no chained indexing).
s_close = df.loc[df['close_or_far'] == 'close', 'median_house_value']
s_far = df.loc[df['close_or_far'] == 'far', 'median_house_value']

# %% Are houses close to a school or hospital more expensive?
# H0: houses are priced the same whether they are close to or far from a school or hospital
# H1: houses close to either a school or a hospital are more expensive
ALPHA = 0.05

# H1 is directional, so a one-sided two-sample test is required. A one-way
# ANOVA (st.f_oneway) only detects that the group means differ; it cannot say
# which group is larger, so it cannot support "more expensive".
#   alternative='greater' -> tests mean(s_close) > mean(s_far)
#   equal_var=False       -> Welch's variant, no equal-variance assumption
price_test = st.ttest_ind(s_close, s_far, alternative='greater', equal_var=False)

# Decision is computed from the result rather than hard-coded in a comment,
# so it stays correct whenever the data or the close/far labels change.
# True  -> reject H0: houses close to a school or hospital are, on average, more expensive.
# False -> fail to reject H0: no evidence that close houses are more expensive.
reject_h0 = price_test.pvalue < ALPHA
price_test, reject_h0