data-bootcamp-v4 · SofiaPS-bio · Sep 29, 2025
diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
@@ -111,13 +111,239 @@
     {
       "cell_type": "code",
       "execution_count": null,
+      "id": "c2caf064",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "#import requests\n",
+        "#from bs4 import BeautifulSoup\n",
+        "#import pandas as pd\n",
+        "\n",
+        "#url = \"http://books.toscrape.com/\"\n",
+        "#response = requests.get(url)\n",
+        "#soup = BeautifulSoup(response.content, 'html.parser')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "998a578e",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "#books = soup.find_all('article', class_='product_pod')\n",
+        "#books"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "766314e2",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "'A Light in the Attic'"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "#title = books[0].find('h3').find('a')['title']\n",
+        "#title"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "df312ede",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "'51.77'"
+            ]
+          },
+          "execution_count": 7,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "#price = books[0].find('p', class_='price_color').text[1:]\n",
+        "#price"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 20,
+      "id": "4e273e76",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def get_page(url):\n",
+        "    \"\"\"Fetch a page of books.toscrape.com and return it as parsed HTML.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    url : str\n",
+        "        Path relative to the site root, e.g. the ``href`` of a book link\n",
+        "        taken from the home page.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    bs4.BeautifulSoup\n",
+        "        The response body parsed with the built-in ``html.parser``.\n",
+        "\n",
+        "    Raises\n",
+        "    ------\n",
+        "    requests.HTTPError\n",
+        "        If the server answers with a 4xx/5xx status code.\n",
+        "    \"\"\"\n",
+        "    # Imported locally so the helper works on a fresh kernel regardless of\n",
+        "    # which other cells have been run before it.\n",
+        "    import requests\n",
+        "    from bs4 import BeautifulSoup\n",
+        "\n",
+        "    complete_url = f'http://books.toscrape.com/{url}'\n",
+        "    # A timeout keeps the cell from hanging forever on a stalled connection.\n",
+        "    response = requests.get(complete_url, timeout=10)\n",
+        "    # Fail loudly here instead of parsing an error page further down.\n",
+        "    response.raise_for_status()\n",
+        "    return BeautifulSoup(response.content, 'html.parser')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "609eb711",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "3"
+            ]
+          },
+          "execution_count": 11,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "#rating_stars = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+        "#rating = rating_stars[books[0].find('p', class_='star-rating')['class'][1]]\n",
+        "\n",
+        "#rating"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 35,
       "id": "40359eee-9cd7-4884-bfa4-83344c222305",
       "metadata": {
         "id": "40359eee-9cd7-4884-bfa4-83344c222305"
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>Title</th>\n",
+              "      <th>Price</th>\n",
+              "      <th>Rating</th>\n",
+              "      <th>UPC</th>\n",
+              "      <th>Genre</th>\n",
+              "      <th>Availability</th>\n",
+              "      <th>Description</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>Set Me Free</td>\n",
+              "      <td>17.46</td>\n",
+              "      <td>5</td>\n",
+              "      <td>ce6396b0f23f6ecc</td>\n",
+              "      <td>Young Adult</td>\n",
+              "      <td>In stock (19 available)</td>\n",
+              "      <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "         Title  Price  Rating               UPC        Genre  \\\n",
+              "0  Set Me Free  17.46       5  ce6396b0f23f6ecc  Young Adult   \n",
+              "\n",
+              "              Availability                                        Description  \n",
+              "0  In stock (19 available)  Aaron Ledbetter’s future had been planned out ...  "
+            ]
+          },
+          "execution_count": 35,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
       "source": [
-        "# Your solution goes here"
+        "# Your solution goes here\n",
+        "def scrape_books(min_rating, max_price):\n",
+        "    \"\"\"Scrape the books.toscrape.com home page and return the matching books.\n",
+        "\n",
+        "    Only the books listed on the home page are considered. For every book\n",
+        "    whose star rating is at least ``min_rating`` and whose price is at most\n",
+        "    ``max_price``, the book's detail page is downloaded with ``get_page`` to\n",
+        "    collect its UPC, genre, availability and description.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    min_rating : int\n",
+        "        Minimum star rating (1 to 5) a book must have to be kept.\n",
+        "    max_price : float\n",
+        "        Maximum price a book may have to be kept.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    pandas.DataFrame\n",
+        "        One row per matching book with the columns Title, Price, Rating,\n",
+        "        UPC, Genre, Availability and Description. Description is None when\n",
+        "        the detail page has no description block.\n",
+        "\n",
+        "    Raises\n",
+        "    ------\n",
+        "    requests.HTTPError\n",
+        "        If the home page request returns a 4xx/5xx status code.\n",
+        "    \"\"\"\n",
+        "    import requests\n",
+        "    from bs4 import BeautifulSoup\n",
+        "    import pandas as pd\n",
+        "\n",
+        "    # The rating is encoded as the second CSS class of the star-rating tag.\n",
+        "    rating_stars = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+        "    # Explicit column order, also used when no book matches the filter.\n",
+        "    columns = ['Title', 'Price', 'Rating', 'UPC', 'Genre', 'Availability', 'Description']\n",
+        "\n",
+        "    response = requests.get('http://books.toscrape.com/', timeout=10)\n",
+        "    response.raise_for_status()\n",
+        "    soup = BeautifulSoup(response.content, 'html.parser')\n",
+        "\n",
+        "    records = []\n",
+        "    for book in soup.find_all('article', class_='product_pod'):\n",
+        "        # [1:] drops the first character of the price text (the currency symbol).\n",
+        "        price = float(book.find('p', class_='price_color').text[1:])\n",
+        "        rating = rating_stars[book.find('p', class_='star-rating')['class'][1]]\n",
+        "\n",
+        "        # Filter on the listing data first so that a detail page is only\n",
+        "        # downloaded for books that will actually be kept.\n",
+        "        if rating < min_rating or price > max_price:\n",
+        "            continue\n",
+        "\n",
+        "        link = book.find('h3').find('a')\n",
+        "        page_content = get_page(link.get('href'))\n",
+        "\n",
+        "        # find() returns None when the page has no description block; guard\n",
+        "        # the lookup so one such book does not abort the whole scrape.\n",
+        "        heading = page_content.find('div', id='product_description')\n",
+        "        paragraph = heading.find_next_sibling('p') if heading is not None else None\n",
+        "        description = paragraph.get_text(strip=True) if paragraph is not None else None\n",
+        "\n",
+        "        records.append({\n",
+        "            'Title': link['title'],\n",
+        "            'Price': price,\n",
+        "            'Rating': rating,\n",
+        "            # The UPC is the first cell of the product information table.\n",
+        "            'UPC': page_content.find('table', class_='table table-striped').find('td').text,\n",
+        "            # Third breadcrumb entry on the detail page.\n",
+        "            'Genre': page_content.find('ul', class_='breadcrumb').find_all('li')[2].text.strip(),\n",
+        "            'Availability': page_content.find('p', class_='instock availability').get_text(strip=True),\n",
+        "            'Description': description,\n",
+        "        })\n",
+        "\n",
+        "    return pd.DataFrame(records, columns=columns)\n",
+        "\n",
+        "scrape_books(4, 20)"
       ]
     }
   ],
@@ -126,7 +352,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "base",
       "language": "python",
       "name": "python3"
     },
@@ -140,7 +366,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.13.5"
     }
   },
   "nbformat": 4,