data-bootcamp-v4 · ClaudiaPalladino · Nov 1, 2025
diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
@@ -110,14 +110,196 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 5,
       "id": "40359eee-9cd7-4884-bfa4-83344c222305",
       "metadata": {
         "id": "40359eee-9cd7-4884-bfa4-83344c222305"
       },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "<Response [200]>"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# Import necessary libraries\n",
+        "import pandas as pd\n",
+        "import requests\n",
+        "from bs4 import BeautifulSoup\n",
+        "\n",
+        "\n",
+        "url = \"https://books.toscrape.com/\"\n",
+        "response = requests.get(url)\n",
+        "response.encoding = 'utf-8'  # Force correct encoding\n",
+        "soup = BeautifulSoup(response.text, \"html.parser\")\n",
+        "response\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "id": "b99003f1",
+      "metadata": {},
       "outputs": [],
       "source": [
-        "# Your solution goes here"
+        "# Set filters for data extraction\n",
+        "min_rating = 4\n",
+        "max_price = 20.0"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "id": "f090dad6",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Helper functions\n",
+        "\n",
+        "# Map the rating word used in the star-rating CSS class to its number\n",
+        "rating_to_int = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+        "\n",
+        "def parse_price(p_text: str) -> float:   # Convert price string '£51.77' to float\n",
+        "    # Remove all non-digit and non-dot characters\n",
+        "    clean_text = ''.join(c for c in p_text if c.isdigit() or c == '.')\n",
+        "    return float(clean_text)\n",
+        "\n",
+        "def parse_rating(article_tag) -> int:   # Extract numeric rating from article tag\n",
+        "    p = article_tag.find('p', class_='star-rating')\n",
+        "    for cls in p.get('class', []):\n",
+        "        if cls in rating_to_int:\n",
+        "            return rating_to_int[cls]\n",
+        "    return 0"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "id": "62da0a04",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "C:\\Users\\cpall\\AppData\\Local\\Temp\\ipykernel_18124\\3419929751.py:51: DeprecationWarning: The 'text' argument to find()-type methods is deprecated. Use 'string' instead.\n",
+            "  upc_tag = detail_soup.find(\"th\", text=\"UPC\")\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                UPC                                              Title  \\\n",
+            "0  ce6396b0f23f6ecc                                        Set Me Free   \n",
+            "1  6258a1f6a6dcfe50  The Four Agreements: A Practical Guide to Pers...   \n",
+            "2  6be3beb0793a53e7                                     Sophie's World   \n",
+            "3  657fe5ead67a7767            Untitled Collection: Sabbath Poems 2014   \n",
+            "4  51653ef291ab7ddc                                    This One Summer   \n",
+            "\n",
+            "   Price (£)  Rating           Genre             Availability  \\\n",
+            "0      17.46       5     Young Adult  In stock (19 available)   \n",
+            "1      17.66       5    Spirituality  In stock (18 available)   \n",
+            "2      15.94       5      Philosophy  In stock (18 available)   \n",
+            "3      14.27       4          Poetry  In stock (16 available)   \n",
+            "4      19.49       4  Sequential Art  In stock (16 available)   \n",
+            "\n",
+            "                                         Description  \n",
+            "0  Aaron Ledbetter’s future had been planned out ...  \n",
+            "1  In The Four Agreements, don Miguel Ruiz reveal...  \n",
+            "2  A page-turning novel that is also an explorati...  \n",
+            "3  More than thirty-five years ago, when the weat...  \n",
+            "4  Every summer, Rose goes with her mom and dad t...  \n",
+            "Total books found: 75\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Scrape the Books to Scrape catalogue and return the books with\n",
+        "# rating >= min_rating and price <= max_price as a pandas DataFrame.\n",
+        "\n",
+        "def scrape_books(min_rating=4, max_price=20.0):\n",
+        "    base_url = \"http://books.toscrape.com/catalogue/\"\n",
+        "    page_num = 1\n",
+        "    books_data = []\n",
+        "\n",
+        "    while True:\n",
+        "        # Construct page URL\n",
+        "        url = f\"{base_url}page-{page_num}.html\"\n",
+        "        response = requests.get(url)\n",
+        "        \n",
+        "        # Stop at the first non-200 response (e.g. past the last page)\n",
+        "        if response.status_code != 200:\n",
+        "            break\n",
+        "        \n",
+        "        soup = BeautifulSoup(response.text, \"html.parser\")\n",
+        "        books = soup.find_all(\"article\", class_=\"product_pod\")\n",
+        "        \n",
+        "        for book in books:\n",
+        "            # Extract basic info\n",
+        "            title = book.h3.a[\"title\"]\n",
+        "            price = parse_price(book.find(\"p\", class_=\"price_color\").text)\n",
+        "            rating = parse_rating(book)\n",
+        "\n",
+        "            # Filter by rating and price\n",
+        "            if rating >= min_rating and price <= max_price:\n",
+        "                # Go to detail page for extra info\n",
+        "                detail_link = book.h3.a[\"href\"]\n",
+        "                # Build the absolute detail URL, dropping any '../../../' prefix from the href\n",
+        "                detail_url = base_url + detail_link.replace('../../../', '')\n",
+        "                detail_resp = requests.get(detail_url)\n",
+        "                detail_resp.encoding = 'utf-8'\n",
+        "                detail_soup = BeautifulSoup(detail_resp.text, \"html.parser\")\n",
+        "\n",
+        "                # Description\n",
+        "                description_tag = detail_soup.find(\"div\", id=\"product_description\")\n",
+        "                description = \"\"\n",
+        "                if description_tag:\n",
+        "                    description = description_tag.find_next_sibling(\"p\").text\n",
+        "\n",
+        "                # Genre from breadcrumb\n",
+        "                breadcrumb = detail_soup.find(\"ul\", class_=\"breadcrumb\")\n",
+        "                genre = breadcrumb.find_all(\"li\")[2].text.strip()\n",
+        "\n",
+        "                # Availability\n",
+        "                availability = detail_soup.find(\"p\", class_=\"instock availability\").text.strip()\n",
+        "\n",
+        "                # UPC (NOTE: bs4 deprecates find(text=...); prefer string=...)\n",
+        "                upc_tag = detail_soup.find(\"th\", text=\"UPC\")\n",
+        "                upc = upc_tag.find_next_sibling(\"td\").text.strip() if upc_tag else \"\"\n",
+        "\n",
+        "                # Store data\n",
+        "                books_data.append({\n",
+        "                    \"UPC\": upc,\n",
+        "                    \"Title\": title,\n",
+        "                    \"Price (£)\": price,\n",
+        "                    \"Rating\": rating,\n",
+        "                    \"Genre\": genre,\n",
+        "                    \"Availability\": availability,\n",
+        "                    \"Description\": description\n",
+        "                })\n",
+        "\n",
+        "        page_num += 1  # Move to next page\n",
+        "\n",
+        "    # Convert to DataFrame\n",
+        "    df = pd.DataFrame(books_data)\n",
+        "    return df\n",
+        "\n",
+        "# Example Usage\n",
+        "if __name__ == \"__main__\":\n",
+        "    df_books = scrape_books(min_rating=4, max_price=20.0)\n",
+        "\n",
+        "    # Show first few results\n",
+        "    print(df_books.head()) \n",
+        "    \n",
+        "    # Show total number of books found\n",
+        "    print(f\"Total books found: {len(df_books)}\")"
       ]
     }
   ],
@@ -126,7 +308,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "Python 3",
       "language": "python",
       "name": "python3"
     },
@@ -140,7 +322,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.12.9"
     }
   },
   "nbformat": 4,