Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 173 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,183 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"source": [
"# Your solution goes here"
"import pandas as pd\n",
"import requests\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "593df2f0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Response [200]>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch the first catalogue page of books.toscrape.com (a site built\n",
"# for scraping practice). Leaving `response` as the last expression\n",
"# displays the HTTP status as the cell output (expect <Response [200]>).\n",
"url = \"https://books.toscrape.com/catalogue/page-1.html\"\n",
"response = requests.get(url)\n",
"response"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5b98f789",
"metadata": {},
"outputs": [],
"source": [
"# Parse with an explicit parser so results are consistent across\n",
"# environments and no GuessedAtParserWarning is raised; this matches\n",
"# the \"html.parser\" used by the scraping functions below.\n",
"soup = BeautifulSoup(response.content, \"html.parser\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "636400c6",
"metadata": {},
"outputs": [],
"source": [
"# Locate the <ol class=\"row\"> grid that holds the product cards.\n",
"books_grid = soup.find(\"ol\", class_=\"row\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f022b55f",
"metadata": {},
"outputs": [],
"source": [
"# Each book card is an <li> carrying this exact Bootstrap column class\n",
"# string; matching on the full multi-class value keeps the lookup exact.\n",
"books = books_grid.find_all(\"li\", attrs = {\"class\":\"col-xs-6 col-sm-4 col-md-3 col-lg-3\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b033b71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" UPC Title Genre Availability Description Price (£) Rating\n",
"0 N/A 404 Not Found Unknown N/A No description 17.46 5\n",
"1 N/A 404 Not Found Unknown N/A No description 17.66 5\n",
"2 N/A 404 Not Found Unknown N/A No description 15.94 5\n",
"3 N/A 404 Not Found Unknown N/A No description 14.27 4\n",
"4 N/A 404 Not Found Unknown N/A No description 19.49 4\n",
"\n",
"Scraping complete. Results saved to 'filtered_books.csv'.\n"
]
}
],
"source": [
"BASE_URL = \"https://books.toscrape.com/\"\n",
"\n",
"# Map star-rating class names (e.g. \"Three\") to numeric values\n",
"RATING_MAP = {\n",
"    \"One\": 1,\n",
"    \"Two\": 2,\n",
"    \"Three\": 3,\n",
"    \"Four\": 4,\n",
"    \"Five\": 5\n",
"}\n",
"\n",
"def get_full_url(relative_url):\n",
"    \"\"\"Resolve a book link found on a catalogue page to an absolute URL.\n",
"\n",
"    Detail pages live under /catalogue/, so after stripping any \"../\"\n",
"    segments the path must be re-anchored there.  (The previous version\n",
"    stripped \"catalogue/\" out of the path instead, which produced URLs\n",
"    that 404 -- hence the \"404 Not Found\" titles, \"N/A\" UPCs and\n",
"    \"Unknown\" genres in the earlier output.)\n",
"    \"\"\"\n",
"    path = relative_url.replace(\"../\", \"\")\n",
"    if not path.startswith(\"catalogue/\"):\n",
"        path = \"catalogue/\" + path\n",
"    return BASE_URL + path\n",
"\n",
"def scrape_book_detail(url):\n",
"    \"\"\"Fetch one book's detail page and return its metadata as a dict.\"\"\"\n",
"    response = requests.get(url)\n",
"    soup = BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"    title = soup.h1.text.strip()\n",
"\n",
"    # The description is the <p> sibling following the\n",
"    # #product_description heading; some books have none.\n",
"    description_tag = soup.select_one(\"#product_description ~ p\")\n",
"    description = description_tag.text.strip() if description_tag else \"No description\"\n",
"\n",
"    # Product-information table: row 0 holds the UPC, row 5 availability.\n",
"    table = soup.select(\"table.table.table-striped tr\")\n",
"    upc = table[0].td.text.strip() if len(table) > 0 else \"N/A\"\n",
"    availability = table[5].td.text.strip() if len(table) > 5 else \"N/A\"\n",
"\n",
"    # Breadcrumb reads Home > Books > <Genre> > <Title>.\n",
"    breadcrumb = soup.select(\"ul.breadcrumb li a\")\n",
"    genre = breadcrumb[2].text.strip() if len(breadcrumb) > 2 else \"Unknown\"\n",
"\n",
"    return {\n",
"        \"UPC\": upc,\n",
"        \"Title\": title,\n",
"        \"Genre\": genre,\n",
"        \"Availability\": availability,\n",
"        \"Description\": description\n",
"    }\n",
"\n",
"def scrape_books(min_rating=4, max_price=20):\n",
"    \"\"\"Scrape every catalogue page and return a DataFrame of the books\n",
"    rated at least `min_rating` stars and priced at most `max_price` GBP.\"\"\"\n",
"    all_books = []\n",
"    page_url = \"catalogue/page-1.html\"\n",
"\n",
"    while page_url:\n",
"        response = requests.get(BASE_URL + page_url)\n",
"        soup = BeautifulSoup(response.content, \"html.parser\")\n",
"        book_items = soup.select(\"article.product_pod\")\n",
"\n",
"        for book in book_items:\n",
"            # The rating is encoded as the second class of the star <p>,\n",
"            # e.g. <p class=\"star-rating Three\">.\n",
"            rating_class = book.p['class'][1]\n",
"            rating = RATING_MAP.get(rating_class, 0)\n",
"\n",
"            # Price text looks like \"£51.77\".\n",
"            price_text = book.select_one(\"p.price_color\").text.strip().replace(\"£\", \"\")\n",
"            price = float(price_text)\n",
"\n",
"            if rating >= min_rating and price <= max_price:\n",
"                # Only fetch the (slow) detail page for books that\n",
"                # already pass the cheap rating/price filter.\n",
"                detail_url = get_full_url(book.h3.a['href'])\n",
"                book_data = scrape_book_detail(detail_url)\n",
"\n",
"                # Attach the listing-page fields to the detail data.\n",
"                book_data.update({\n",
"                    \"Price (£)\": price,\n",
"                    \"Rating\": rating\n",
"                })\n",
"\n",
"                all_books.append(book_data)\n",
"\n",
"        # Follow pagination: hrefs inside \"li.next > a\" are relative to\n",
"        # the catalogue/ directory.\n",
"        next_button = soup.select_one(\"li.next > a\")\n",
"        page_url = \"catalogue/\" + next_button['href'] if next_button else None\n",
"\n",
"    return pd.DataFrame(all_books)\n",
"\n",
"if __name__ == \"__main__\":\n",
"    # Example: books with rating >= 4 stars and price <= £20.\n",
"    df = scrape_books(min_rating=4, max_price=20)\n",
"    print(df.head())\n",
"    df.to_csv(\"filtered_books.csv\", index=False)\n",
"    print(\"\\nScraping complete. Results saved to 'filtered_books.csv'.\")\n"
]
}
],
Expand All @@ -126,7 +295,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +309,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.2"
}
},
"nbformat": 4,
Expand Down