"""Lab solution for lab-web-scraping.ipynb: scrape books.toscrape.com.

Walks the paginated catalogue, keeps the books whose star rating and price
pass the given thresholds, visits each kept book's detail page for the extra
fields, and returns everything as a pandas DataFrame.
"""

import itertools
import re
from urllib.parse import urljoin

CATALOGUE_PAGE_URL = "http://books.toscrape.com/catalogue/page-{}.html"

# Star ratings are encoded on the listing page as a CSS class name.
RATINGS_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

# Column order of the resulting DataFrame (also used when no book matches,
# so that an empty result still exposes the expected columns).
COLUMNS = [
    "UPC",
    "Title",
    "Price (£)",
    "Rating",
    "Genre",
    "Availability",
    "Description",
]

_PRICE_PATTERN = re.compile(r"\d+(?:\.\d+)?")
_MISSING = "N/A"


def get_rating(rating_class):
    """Convert a star-rating class name ('One'..'Five') to its number.

    Returns 0 for any unrecognised class name.
    """
    return RATINGS_MAP.get(rating_class, 0)


def parse_price(price_text):
    """Return the numeric part of a price label such as '£17.46' as a float.

    Anything before the first digit (currency symbol, stray bytes from a
    wrongly decoded page) is ignored.

    Raises:
        ValueError: if *price_text* contains no number at all.
    """
    match = _PRICE_PATTERN.search(price_text)
    if match is None:
        raise ValueError(f"no numeric price found in {price_text!r}")
    return float(match.group())


def _fetch_soup(session, url, timeout, *, allow_missing=False):
    """GET *url* and return it parsed, or None for a tolerated 404."""
    # Third-party imports are deferred so the pure helpers above stay
    # importable without the scraping dependencies installed.
    from bs4 import BeautifulSoup

    response = session.get(url, timeout=timeout)
    if allow_missing and response.status_code == 404:
        return None
    # Any other HTTP error must surface: parsing an error page would look
    # like "no books" and silently truncate the results.
    response.raise_for_status()
    # Hand over the raw bytes so the parser can detect the encoding from the
    # document itself instead of relying on the HTTP header alone.
    return BeautifulSoup(response.content, "html.parser")


def _listing_rating(book):
    """Return the numeric rating of one listing-page ``article`` tag."""
    rating_tag = book.find("p", class_="star-rating")
    classes = rating_tag.get("class", []) if rating_tag is not None else []
    # The tag carries two classes: 'star-rating' and the rating word.
    rating_word = next((name for name in classes if name != "star-rating"), "")
    return get_rating(rating_word)


def _extract_details(book_soup):
    """Pull UPC, genre, availability and description from a detail page.

    Fields whose tag is absent are reported as 'N/A' rather than raising.
    """
    # `string=` replaces the deprecated `text=` keyword of find().
    upc_header = book_soup.find("th", string="UPC")
    upc_cell = upc_header.find_next("td") if upc_header is not None else None

    breadcrumb = book_soup.find("ul", class_="breadcrumb")
    crumbs = breadcrumb.find_all("a") if breadcrumb is not None else []

    availability_tag = book_soup.select_one("p.instock.availability")
    description_tag = book_soup.find("meta", attrs={"name": "description"})

    return {
        "UPC": upc_cell.text if upc_cell is not None else _MISSING,
        # The genre is taken from the third breadcrumb link.
        "Genre": crumbs[2].text if len(crumbs) > 2 else _MISSING,
        "Availability": (
            availability_tag.text.strip()
            if availability_tag is not None
            else _MISSING
        ),
        "Description": (
            description_tag.get("content", "").strip()
            if description_tag is not None
            else _MISSING
        ),
    }


def scrape_books(min_rating=4.0, max_price=20.0, *, timeout=10.0):
    """Scrape the catalogue and return the books passing both thresholds.

    Args:
        min_rating: Minimum star rating (inclusive) a book must have.
        max_price: Maximum price (inclusive) a book may cost.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A pandas DataFrame with the columns listed in ``COLUMNS``; it has
        zero rows (but the same columns) when no book matches.

    Raises:
        requests.RequestException: on timeouts, connection failures, or any
            HTTP error other than the 404 that marks the end of pagination.
    """
    import pandas as pd
    import requests

    books_data = []

    # One session reuses the underlying connection across all requests.
    with requests.Session() as session:
        for page in itertools.count(1):
            page_url = CATALOGUE_PAGE_URL.format(page)
            soup = _fetch_soup(session, page_url, timeout, allow_missing=True)
            if soup is None:
                break  # Past the last catalogue page.

            books = soup.select("article.product_pod")
            if not books:
                break

            for book in books:
                rating = _listing_rating(book)
                price = parse_price(book.find("p", class_="price_color").text)
                if rating < min_rating or price > max_price:
                    continue

                link = book.h3.a
                # Detail links are relative to the listing page they are on.
                book_url = urljoin(page_url, link["href"])
                details = _extract_details(
                    _fetch_soup(session, book_url, timeout)
                )

                books_data.append({
                    "UPC": details["UPC"],
                    "Title": link["title"],
                    "Price (£)": price,
                    "Rating": rating,
                    "Genre": details["Genre"],
                    "Availability": details["Availability"],
                    "Description": details["Description"],
                })

    return pd.DataFrame(books_data, columns=COLUMNS)


# In a notebook cell __name__ is "__main__", so this still runs there; the
# guard only prevents a network scrape when the code is imported as a module.
if __name__ == "__main__":
    books_df_result = scrape_books(min_rating=4.0, max_price=20.0)
    print(books_df_result.head())