# Code cell from lab-web-scraping.ipynb: scrape the books.toscrape.com catalogue.
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Maps the word found in a book's star-rating CSS class to its numeric value.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5,
}

# Column order of the DataFrame returned by scrape_books; also used to give an
# empty result the same schema as a populated one.
_COLUMNS = [
    "UPC",
    "Title",
    "Price (£)",
    "Rating",
    "Genre",
    "Availability",
    "Description",
]

# Strips everything except digits and the decimal point from a price string.
_NON_PRICE_CHARS = re.compile(r"[^0-9.]")

# Errors that mean "this one book could not be fetched or parsed": network
# failures plus the lookup/conversion errors raised when expected markup is
# missing. Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
_SKIPPABLE_ERRORS = (
    requests.RequestException,
    AttributeError,
    KeyError,
    IndexError,
    TypeError,
    ValueError,
)


def scrape_books(min_rating=4, max_price=20, max_pages=50, timeout=10):
    """Scrape books.toscrape.com and return the books matching the filters.

    Walks the paginated catalogue, and for every book whose rating is at
    least ``min_rating`` and whose price is at most ``max_price`` fetches the
    book's detail page to collect the remaining fields.

    Args:
        min_rating: Minimum star rating (1-5) a book must have to be kept.
        max_price: Maximum price a book may have to be kept (inclusive).
        max_pages: Highest catalogue page number to request. Scraping stops
            earlier if a listing page does not answer with HTTP 200.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``pandas.DataFrame`` with the columns UPC, Title, Price (£),
        Rating, Genre, Availability and Description; one row per kept book.
        The columns are present even when no book matches.

    Raises:
        requests.RequestException: If a catalogue listing page cannot be
            fetched at all (connection error, timeout). Failures on an
            individual book are skipped instead.
    """
    base_url = "https://books.toscrape.com/catalogue/"
    rows = []

    # One session reuses the underlying connection across the many requests.
    with requests.Session() as session:
        for page in range(1, max_pages + 1):
            listing = session.get(f"{base_url}page-{page}.html", timeout=timeout)
            if listing.status_code != 200:
                break

            # Parse the raw bytes so BeautifulSoup detects the page encoding
            # itself; `.text` relies on Requests' charset guess, which turned
            # characters such as "’" into mojibake in the scraped text.
            soup = BeautifulSoup(listing.content, "html.parser")

            for book in soup.find_all("article", class_="product_pod"):
                try:
                    title = book.h3.a["title"]
                    link = base_url + book.h3.a["href"]
                    # The second CSS class of the first <p> is the rating word.
                    rating = rating_map[book.p["class"][1]]
                    price_text = book.find("p", class_="price_color").text
                    price = float(_NON_PRICE_CHARS.sub("", price_text))

                    # Filter before fetching the detail page to avoid
                    # requests for books that will be discarded anyway.
                    if rating < min_rating or price > max_price:
                        continue

                    detail = session.get(link, timeout=timeout)
                    detail.raise_for_status()
                    detail_soup = BeautifulSoup(detail.content, "html.parser")

                    # First cell of the product information table.
                    upc = detail_soup.find("table").find_all("td")[0].text
                    # Third breadcrumb entry is taken as the genre.
                    breadcrumb = detail_soup.find("ul", class_="breadcrumb")
                    genre = breadcrumb.find_all("li")[2].text.strip()
                    # The description heading is optional; the text lives in
                    # the <p> that follows it.
                    desc_tag = detail_soup.find("div", id="product_description")
                    description = (
                        desc_tag.find_next("p").text.strip() if desc_tag else ""
                    )
                    availability = detail_soup.find(
                        "p", class_="instock availability"
                    ).text.strip()

                    rows.append({
                        "UPC": upc,
                        "Title": title,
                        "Price (£)": price,
                        "Rating": rating,
                        "Genre": genre,
                        "Availability": availability,
                        "Description": description,
                    })
                except _SKIPPABLE_ERRORS:
                    # Best effort: skip a book that cannot be fetched/parsed.
                    continue

    return pd.DataFrame(rows, columns=_COLUMNS)
| \n", + " | UPC | \n", + "Title | \n", + "Price (£) | \n", + "Rating | \n", + "Genre | \n", + "Availability | \n", + "Description | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "ce6396b0f23f6ecc | \n", + "Set Me Free | \n", + "17.46 | \n", + "5 | \n", + "Young Adult | \n", + "In stock (19 available) | \n", + "Aaron Ledbetter’s future had been planned ou... | \n", + "
| 1 | \n", + "6258a1f6a6dcfe50 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "17.66 | \n", + "5 | \n", + "Spirituality | \n", + "In stock (18 available) | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "
| 2 | \n", + "6be3beb0793a53e7 | \n", + "Sophie's World | \n", + "15.94 | \n", + "5 | \n", + "Philosophy | \n", + "In stock (18 available) | \n", + "A page-turning novel that is also an explorati... | \n", + "
| 3 | \n", + "657fe5ead67a7767 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "14.27 | \n", + "4 | \n", + "Poetry | \n", + "In stock (16 available) | \n", + "More than thirty-five years ago, when the weat... | \n", + "
| 4 | \n", + "51653ef291ab7ddc | \n", + "This One Summer | \n", + "19.49 | \n", + "4 | \n", + "Sequential Art | \n", + "In stock (16 available) | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "