import logging
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Maps the word used in a book's "star-rating <Word>" CSS class to its number.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5,
}

_CATALOGUE_URL = "https://books.toscrape.com/catalogue/"

# Column order of the returned DataFrame; also used when no book matches so
# the caller always gets the same schema.
_COLUMNS = [
    "UPC",
    "Title",
    "Price (£)",
    "Rating",
    "Genre",
    "Availability",
    "Description",
]

# Everything that is not a digit or a dot is stripped from the price text
# (currency symbol and any stray encoding bytes).
_NON_PRICE_CHARS = re.compile(r"[^0-9.]")

# Errors raised when an expected tag/attribute is missing or malformed.
_PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)
_SKIP_ERRORS = (requests.RequestException,) + _PARSE_ERRORS


def _parse_listing(card):
    """Return (title, href, rating, price) parsed from one listing card."""
    anchor = card.h3.a
    rating_tag = card.find("p", class_="star-rating")
    # Look the rating word up by membership instead of by position so the
    # order of the CSS classes does not matter.
    rating = next(
        (rating_map[name] for name in rating_tag["class"] if name in rating_map),
        None,
    )
    if rating is None:
        raise ValueError("no recognised star-rating class")
    price_text = card.find("p", class_="price_color").text
    price = float(_NON_PRICE_CHARS.sub("", price_text))
    return anchor["title"], anchor["href"], rating, price


def _fetch_details(session, link, timeout):
    """Download a book's detail page and return its detail-only fields."""
    response = session.get(link, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so BeautifulSoup detects the encoding from the
    # document itself; Requests falls back to ISO-8859-1 for text responses
    # whose HTTP header carries no charset.
    page = BeautifulSoup(response.content, "html.parser")

    heading = page.find("div", id="product_description")
    return {
        "UPC": page.find("table").find_all("td")[0].text,
        "Genre": page.find("ul", class_="breadcrumb")
        .find_all("li")[2]
        .text.strip(),
        "Availability": page.find(
            "p", class_="instock availability"
        ).text.strip(),
        # Some books have no description section at all.
        "Description": heading.find_next("p").text.strip() if heading else "",
    }


def scrape_books(min_rating=4, max_price=20, *, max_pages=50, timeout=10):
    """Scrape books.toscrape.com for well-rated, affordable books.

    Walks the catalogue listing pages in order, and for every book whose
    rating is at least ``min_rating`` and whose price is at most
    ``max_price`` fetches the detail page to collect the remaining fields.
    A book whose listing entry or detail page cannot be fetched or parsed is
    logged and skipped rather than aborting the whole run.

    Args:
        min_rating: Lowest star rating (1-5) a book may have to be kept.
        max_price: Highest price a book may have to be kept.
        max_pages: Number of listing pages to visit at most.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``pandas.DataFrame`` with the columns UPC, Title, Price (£),
        Rating, Genre, Availability and Description; empty (but with those
        columns) when nothing matches.

    Raises:
        requests.RequestException: If a listing page request fails at the
            network level. A non-200 status simply ends the crawl.
    """
    rows = []

    # One session reuses the underlying connection across all requests.
    with requests.Session() as session:
        for page in range(1, max_pages + 1):
            url = urljoin(_CATALOGUE_URL, f"page-{page}.html")
            response = session.get(url, timeout=timeout)
            if response.status_code != 200:
                # Past the last listing page (or the site is unhappy): stop.
                break

            soup = BeautifulSoup(response.content, "html.parser")

            for card in soup.find_all("article", class_="product_pod"):
                try:
                    title, href, rating, price = _parse_listing(card)
                except _PARSE_ERRORS as exc:
                    logger.warning("Skipping a listing on %s: %r", url, exc)
                    continue

                if rating < min_rating or price > max_price:
                    continue

                # hrefs on listing pages are relative to the listing URL.
                link = urljoin(url, href)
                try:
                    details = _fetch_details(session, link, timeout)
                except _SKIP_ERRORS as exc:
                    logger.warning("Skipping %s: %r", link, exc)
                    continue

                rows.append(
                    {
                        "Title": title,
                        "Price (£)": price,
                        "Rating": rating,
                        **details,
                    }
                )

    return pd.DataFrame(rows, columns=_COLUMNS)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UPCTitlePrice (£)RatingGenreAvailabilityDescription
0ce6396b0f23f6eccSet Me Free17.465Young AdultIn stock (19 available)Aaron Ledbetter’s future had been planned ou...
16258a1f6a6dcfe50The Four Agreements: A Practical Guide to Pers...17.665SpiritualityIn stock (18 available)In The Four Agreements, don Miguel Ruiz reveal...
26be3beb0793a53e7Sophie's World15.945PhilosophyIn stock (18 available)A page-turning novel that is also an explorati...
3657fe5ead67a7767Untitled Collection: Sabbath Poems 201414.274PoetryIn stock (16 available)More than thirty-five years ago, when the weat...
451653ef291ab7ddcThis One Summer19.494Sequential ArtIn stock (16 available)Every summer, Rose goes with her mom and dad t...
\n", + "
" + ], + "text/plain": [ + " UPC Title \\\n", + "0 ce6396b0f23f6ecc Set Me Free \n", + "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n", + "2 6be3beb0793a53e7 Sophie's World \n", + "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n", + "4 51653ef291ab7ddc This One Summer \n", + "\n", + " Price (£) Rating Genre Availability \\\n", + "0 17.46 5 Young Adult In stock (19 available) \n", + "1 17.66 5 Spirituality In stock (18 available) \n", + "2 15.94 5 Philosophy In stock (18 available) \n", + "3 14.27 4 Poetry In stock (16 available) \n", + "4 19.49 4 Sequential Art In stock (16 available) \n", + "\n", + " Description \n", + "0 Aaron Ledbetter’s future had been planned ou... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = scrape_books(min_rating=4, max_price=20)\n", + "df.head()\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88bac121", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -126,7 +316,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +330,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,