# Exploratory cells from lab-web-scraping.ipynb: fetch the first catalogue
# page and locate the list of book entries on it.
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://books.toscrape.com/catalogue/page-1.html"
response = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# were the catalogue.
response.raise_for_status()
response  # notebook cell output: shows the Response repr

# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(response.content, "html.parser")

# The <ol class="row"> element wraps the book entries of the page.
books_grid = soup.find("ol", attrs={"class": "row"})

# One <li> per book inside that grid.
books = books_grid.find_all(
    "li", attrs={"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"}
)
BASE_URL = "http://books.toscrape.com/"

# Path segment (relative to BASE_URL) under which the listing pages and the
# book detail pages are served.
CATALOGUE_PATH = "catalogue/"

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would hang the scrape forever.
REQUEST_TIMEOUT = 10

# Map star-rating class names to numeric values
RATING_MAP = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5,
}


def get_full_url(relative_url):
    """Return the absolute URL of a book detail page.

    Accepts the href of a book link as found on a listing page, with or
    without leading ``../`` segments or a leading ``catalogue/`` prefix, and
    always resolves it below ``BASE_URL + "catalogue/"``.

    The previous version removed ``catalogue/`` and never put it back, so
    every detail request hit the site's 404 page and the scraped rows
    contained "404 Not Found" titles with placeholder values.
    """
    path = relative_url
    # Drop any number of leading parent-directory hops.
    while path.startswith("../"):
        path = path[len("../"):]
    # Drop an existing catalogue prefix so it is not duplicated below.
    if path.startswith(CATALOGUE_PATH):
        path = path[len(CATALOGUE_PATH):]
    return BASE_URL + CATALOGUE_PATH + path


def scrape_book_detail(url):
    """Fetch one book detail page and extract its descriptive fields.

    Args:
        url: Absolute URL of the book detail page.

    Returns:
        A dict with the keys ``UPC``, ``Title``, ``Genre``, ``Availability``
        and ``Description``. Fields that cannot be located on the page fall
        back to a placeholder string.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never scraped as if it were a book.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    title = soup.h1.text.strip() if soup.h1 else "Unknown"

    # The description is the <p> sibling following the #product_description
    # heading; some pages have none.
    description_tag = soup.select_one("#product_description ~ p")
    description = description_tag.text.strip() if description_tag else "No description"

    # Rows of the product information table, addressed by position.
    table = soup.select("table.table.table-striped tr")
    upc = table[0].td.text.strip() if len(table) > 0 else "N/A"
    availability = table[5].td.text.strip() if len(table) > 5 else "N/A"

    # The third breadcrumb link is used as the genre.
    breadcrumb = soup.select("ul.breadcrumb li a")
    genre = breadcrumb[2].text.strip() if len(breadcrumb) > 2 else "Unknown"

    return {
        "UPC": upc,
        "Title": title,
        "Genre": genre,
        "Availability": availability,
        "Description": description,
    }


def scrape_books(min_rating=4, max_price=20):
    """Scrape books from the site matching rating and price filters.

    Walks the paginated catalogue starting at ``catalogue/page-1.html`` and
    follows the "next" link until there is none. Only books that pass the
    filters trigger a request for their detail page.

    Args:
        min_rating: Minimum star rating (1-5) a book must have.
        max_price: Maximum price in pounds a book may cost.

    Returns:
        A pandas DataFrame with one row per matching book, holding the
        fields from ``scrape_book_detail`` plus ``Price (£)`` and ``Rating``.

    Raises:
        requests.HTTPError: If a listing or detail page answers with an
            error status.
    """
    all_books = []
    page_url = CATALOGUE_PATH + "page-1.html"

    while page_url:
        response = requests.get(BASE_URL + page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        book_items = soup.select("article.product_pod")

        for book in book_items:
            # The rating is the second class on the first <p> of the entry;
            # unknown class names map to 0.
            rating_class = book.p["class"][1]
            rating = RATING_MAP.get(rating_class, 0)

            # Extract price
            price_text = book.select_one("p.price_color").text.strip().replace("£", "")
            price = float(price_text)

            # Filter before fetching the detail page to avoid needless requests.
            if rating < min_rating or price > max_price:
                continue

            detail_url = get_full_url(book.h3.a["href"])
            book_data = scrape_book_detail(detail_url)

            # Price and rating come from the listing entry itself.
            book_data.update({
                "Price (£)": price,
                "Rating": rating,
            })
            all_books.append(book_data)

        # The "next" href is relative to the catalogue directory.
        next_button = soup.select_one("li.next > a")
        page_url = CATALOGUE_PATH + next_button["href"] if next_button else None

    return pd.DataFrame(all_books)


if __name__ == "__main__":
    # Example: get books with rating >= 4 and price <= 20
    df = scrape_books(min_rating=4, max_price=20)
    print(df.head())
    df.to_csv("filtered_books.csv", index=False)
    print("\nScraping complete. Results saved to 'filtered_books.csv'.")