data-bootcamp-v4 · MiaZhou112 · Aug 30, 2025
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -278,7 +278,7 @@
        "[800 rows x 11 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -297,11 +297,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "np.float64(0.0007993609745420597)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "#code here"
+    "# HP of Dragon-type (HP_Pokemons) and Grass-type (HP_Grass) Pokemon, selected by 'Type 1'.\n",
+    "HP_Pokemons = df[df['Type 1'] == 'Dragon']['HP']\n",
+    "HP_Grass = df[df['Type 1'] == 'Grass']['HP']\n",
+    "\n",
+    "# H0: mean HP of Dragon type (HP_Pokemons) = mean HP of Grass type (HP_Grass)\n",
+    "# H1: mean HP of Dragon type (HP_Pokemons) > mean HP of Grass type (HP_Grass)\n",
+    "\n",
+    "# alpha = 0.05\n",
+    "\n",
+    "# One-sided two-sample t-test on independent samples; equal_var=False selects Welch's test (no equal-variance assumption).\n",
+    "_,p_value = st.ttest_ind(HP_Pokemons,HP_Grass,alternative='greater',equal_var=False)\n",
+    "p_value\n",
+    "\n",
+    "# Conclusion: p_value (~0.0008) < alpha, so reject H0: Dragon-type Pokemon have higher average HP than Grass type.\n",
+    "\n"
    ]
   },
   {
@@ -313,11 +338,48 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Multivariate linear model\n",
+      "================================================================\n",
+      "                                                                \n",
+      "----------------------------------------------------------------\n",
+      "       Intercept         Value  Num DF  Den DF   F Value  Pr > F\n",
+      "----------------------------------------------------------------\n",
+      "          Wilks' lambda  0.0592 6.0000 793.0000 2100.8338 0.0000\n",
+      "         Pillai's trace  0.9408 6.0000 793.0000 2100.8338 0.0000\n",
+      " Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
+      "    Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
+      "----------------------------------------------------------------\n",
+      "                                                                \n",
+      "----------------------------------------------------------------\n",
+      "          Legendary        Value  Num DF  Den DF  F Value Pr > F\n",
+      "----------------------------------------------------------------\n",
+      "             Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n",
+      "            Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n",
+      "    Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n",
+      "       Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n",
+      "================================================================\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "# Multivariate comparison (MANOVA) of the six stats between Legendary and non-Legendary Pokémon.\n",
+    "from statsmodels.multivariate.manova import MANOVA\n",
+    "from patsy import dmatrix  # NOTE(review): dmatrix is not used in this cell\n",
+    "\n",
+    "# H0: Legendary and non-Legendary Pokémon have the same mean vector of stats (HP, Attack, Defense, Sp. Atk, Sp. Def, Speed).\n",
+    "# H1: The mean vectors differ, i.e. at least one stat has a different mean between Legendary and non-Legendary Pokémon.\n",
+    "maov = MANOVA.from_formula('HP + Attack + Defense + Q(\"Sp. Atk\") + Q(\"Sp. Def\") + Speed ~ Legendary', data=df)\n",
+    "print(maov.mv_test())\n",
+    "\n",
+    "# Conclusion: Pr > F is reported as 0.0000 (< 0.05) for all four Legendary test statistics, so reject H0: the combined stats (HP, Attack, Defense, Sp. Atk, Sp. Def, Speed) differ significantly between Legendary and non-Legendary Pokémon."
    ]
   },
   {
@@ -337,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [
     {
@@ -433,34 +495,121 @@
        "      <td>1.9250</td>\n",
        "      <td>65500.0</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16995</th>\n",
+       "      <td>-124.26</td>\n",
+       "      <td>40.58</td>\n",
+       "      <td>52.0</td>\n",
+       "      <td>2217.0</td>\n",
+       "      <td>394.0</td>\n",
+       "      <td>907.0</td>\n",
+       "      <td>369.0</td>\n",
+       "      <td>2.3571</td>\n",
+       "      <td>111400.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16996</th>\n",
+       "      <td>-124.27</td>\n",
+       "      <td>40.69</td>\n",
+       "      <td>36.0</td>\n",
+       "      <td>2349.0</td>\n",
+       "      <td>528.0</td>\n",
+       "      <td>1194.0</td>\n",
+       "      <td>465.0</td>\n",
+       "      <td>2.5179</td>\n",
+       "      <td>79000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16997</th>\n",
+       "      <td>-124.30</td>\n",
+       "      <td>41.84</td>\n",
+       "      <td>17.0</td>\n",
+       "      <td>2677.0</td>\n",
+       "      <td>531.0</td>\n",
+       "      <td>1244.0</td>\n",
+       "      <td>456.0</td>\n",
+       "      <td>3.0313</td>\n",
+       "      <td>103600.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16998</th>\n",
+       "      <td>-124.30</td>\n",
+       "      <td>41.80</td>\n",
+       "      <td>19.0</td>\n",
+       "      <td>2672.0</td>\n",
+       "      <td>552.0</td>\n",
+       "      <td>1298.0</td>\n",
+       "      <td>478.0</td>\n",
+       "      <td>1.9797</td>\n",
+       "      <td>85800.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16999</th>\n",
+       "      <td>-124.35</td>\n",
+       "      <td>40.54</td>\n",
+       "      <td>52.0</td>\n",
+       "      <td>1820.0</td>\n",
+       "      <td>300.0</td>\n",
+       "      <td>806.0</td>\n",
+       "      <td>270.0</td>\n",
+       "      <td>3.0147</td>\n",
+       "      <td>94600.0</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>17000 rows × 9 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
-       "0    -114.31     34.19                15.0       5612.0          1283.0   \n",
-       "1    -114.47     34.40                19.0       7650.0          1901.0   \n",
-       "2    -114.56     33.69                17.0        720.0           174.0   \n",
-       "3    -114.57     33.64                14.0       1501.0           337.0   \n",
-       "4    -114.57     33.57                20.0       1454.0           326.0   \n",
+       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
+       "0        -114.31     34.19                15.0       5612.0          1283.0   \n",
+       "1        -114.47     34.40                19.0       7650.0          1901.0   \n",
+       "2        -114.56     33.69                17.0        720.0           174.0   \n",
+       "3        -114.57     33.64                14.0       1501.0           337.0   \n",
+       "4        -114.57     33.57                20.0       1454.0           326.0   \n",
+       "...          ...       ...                 ...          ...             ...   \n",
+       "16995    -124.26     40.58                52.0       2217.0           394.0   \n",
+       "16996    -124.27     40.69                36.0       2349.0           528.0   \n",
+       "16997    -124.30     41.84                17.0       2677.0           531.0   \n",
+       "16998    -124.30     41.80                19.0       2672.0           552.0   \n",
+       "16999    -124.35     40.54                52.0       1820.0           300.0   \n",
        "\n",
-       "   population  households  median_income  median_house_value  \n",
-       "0      1015.0       472.0         1.4936             66900.0  \n",
-       "1      1129.0       463.0         1.8200             80100.0  \n",
-       "2       333.0       117.0         1.6509             85700.0  \n",
-       "3       515.0       226.0         3.1917             73400.0  \n",
-       "4       624.0       262.0         1.9250             65500.0  "
+       "       population  households  median_income  median_house_value  \n",
+       "0          1015.0       472.0         1.4936             66900.0  \n",
+       "1          1129.0       463.0         1.8200             80100.0  \n",
+       "2           333.0       117.0         1.6509             85700.0  \n",
+       "3           515.0       226.0         3.1917             73400.0  \n",
+       "4           624.0       262.0         1.9250             65500.0  \n",
+       "...           ...         ...            ...                 ...  \n",
+       "16995       907.0       369.0         2.3571            111400.0  \n",
+       "16996      1194.0       465.0         2.5179             79000.0  \n",
+       "16997      1244.0       456.0         3.0313            103600.0  \n",
+       "16998      1298.0       478.0         1.9797             85800.0  \n",
+       "16999       806.0       270.0         3.0147             94600.0  \n",
+       "\n",
+       "[17000 rows x 9 columns]"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 47,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
-    "df.head()"
+    "df"
    ]
   },
   {
@@ -483,22 +632,93 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 70,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import math\n",
+    "# Reference points as (longitude, latitude), the order in which calculate_distance indexes them.\n",
+    "school_location = (-118, 34)\n",
+    "hospital_location = (-122, 37)\n"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_distance(row,location):\n",
+    "    \"\"\"Return the Euclidean distance between row's (longitude, latitude) and location (index 0 = longitude, 1 = latitude).\"\"\"\n",
+    "    d_lon, d_lat = row['longitude'] - location[0], row['latitude'] - location[1]\n",
+    "    return math.sqrt(d_lon**2 + d_lat**2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Row-wise distance from every house (neighborhood) to the school and to the hospital.\n",
+    "df['dis_from_school'] = df.apply(lambda house: calculate_distance(house, school_location), axis=1)\n",
+    "df['dis_from_hospital'] = df.apply(lambda house: calculate_distance(house, hospital_location), axis=1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A house (neighborhood) is 'close' when its distance to the school OR to the hospital is below 0.50, otherwise 'far'.\n",
+    "# The second condition must test dis_from_hospital; testing dis_from_school twice would ignore hospital proximity entirely.\n",
+    "df['close_or_far'] = df.apply(lambda row: 'close' if (row['dis_from_school'] < 0.5) or (row['dis_from_hospital'] < 0.5) else 'far',axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Split median_house_value into the 'close' and 'far' groups labelled in close_or_far.\n",
+    "\n",
+    "s_close = df.loc[df['close_or_far'] == 'close', 'median_house_value']\n",
+    "s_far = df.loc[df['close_or_far'] == 'far', 'median_house_value']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "F_onewayResult(statistic=np.float64(577.462645674138), pvalue=np.float64(1.6450819839186202e-125))"
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# H0: mean house value is the same whether a house is close to or far from a school or hospital\n",
+    "# H1: mean house value is higher for houses close to either a school or a hospital (one-sided)\n",
+    "\n",
+    "# alpha = 0.05; one-sided Welch two-sample t-test, because f_oneway only tests for a difference and cannot support a 'more expensive' claim\n",
+    "_, p_value_close = st.ttest_ind(s_close, s_far, alternative='greater', equal_var=False)\n",
+    "p_value_close\n",
+    "\n",
+    "# Conclusion: reject H0 only if p_value_close < 0.05; in that case houses close to either a school or a hospital are statistically more expensive."
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "py310",
    "language": "python",
    "name": "python3"
   },
@@ -512,7 +732,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.10.18"
   }
  },
  "nbformat": 4,