# Helper functions

# Maps the word used in a book's "star-rating" CSS class to its numeric value.
rating_to_int = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}


def parse_price(p_text: str) -> float:
    """Convert a price string such as '£51.77' to a float.

    Every character that is not a decimal digit or a dot is dropped before
    conversion, so currency symbols, mis-decoded bytes (e.g. 'Â£') and
    thousands separators are ignored.

    Args:
        p_text: Raw price text taken from the page.

    Returns:
        The numeric price.

    Raises:
        ValueError: If no number can be extracted from ``p_text``.
    """
    # isdecimal() (rather than isdigit()) keeps out characters such as
    # superscript digits, which float() cannot parse.
    clean_text = ''.join(ch for ch in p_text if ch.isdecimal() or ch == '.')
    try:
        return float(clean_text)
    except ValueError:
        # Report the original text; the stripped remainder is often just ''.
        raise ValueError(f"No numeric price found in {p_text!r}") from None


def parse_rating(article_tag) -> int:
    """Extract the numeric star rating from a book's article tag.

    Args:
        article_tag: Parsed tag exposing ``find``; the rating is read from
            the class list of its ``<p class="star-rating ...">`` child.

    Returns:
        The rating from 1 to 5, or 0 when the rating paragraph is missing
        or none of its classes is a known rating word.
    """
    rating_tag = article_tag.find('p', class_='star-rating')
    if rating_tag is None:
        # No rating markup at all: treat as unrated instead of crashing.
        return 0
    css_classes = rating_tag.get('class', []) or []
    return next(
        (rating_to_int[cls] for cls in css_classes if cls in rating_to_int),
        0,
    )
"id": "62da0a04", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\cpall\\AppData\\Local\\Temp\\ipykernel_18124\\3419929751.py:51: DeprecationWarning: The 'text' argument to find()-type methods is deprecated. Use 'string' instead.\n", + " upc_tag = detail_soup.find(\"th\", text=\"UPC\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " UPC Title \\\n", + "0 ce6396b0f23f6ecc Set Me Free \n", + "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n", + "2 6be3beb0793a53e7 Sophie's World \n", + "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n", + "4 51653ef291ab7ddc This One Summer \n", + "\n", + " Price (£) Rating Genre Availability \\\n", + "0 17.46 5 Young Adult In stock (19 available) \n", + "1 17.66 5 Spirituality In stock (18 available) \n", + "2 15.94 5 Philosophy In stock (18 available) \n", + "3 14.27 4 Poetry In stock (16 available) \n", + "4 19.49 4 Sequential Art In stock (16 available) \n", + "\n", + " Description \n", + "0 Aaron Ledbetter’s future had been planned out ... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... 
from urllib.parse import urljoin

# Column order of the DataFrame returned by scrape_books (also used when
# no book matches, so the result always has the same columns).
_BOOK_COLUMNS = [
    "UPC",
    "Title",
    "Price (£)",
    "Rating",
    "Genre",
    "Availability",
    "Description",
]

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def _fetch_soup(session, url):
    """Download ``url`` and return it parsed, or None if the status is not 200."""
    response = session.get(url, timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200:
        return None
    # Force UTF-8 on every page so listing pages are decoded the same way
    # as detail pages (titles may contain non-ASCII characters).
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, "html.parser")


def _parse_detail_page(detail_soup):
    """Return (upc, genre, availability, description) from a book detail page.

    Any piece that cannot be found is returned as an empty string.
    """
    if detail_soup is None:
        return "", "", "", ""

    # Description: the paragraph following the "product_description" header.
    description = ""
    description_header = detail_soup.find("div", id="product_description")
    if description_header:
        description_p = description_header.find_next_sibling("p")
        if description_p:
            description = description_p.text

    # Genre: third entry of the breadcrumb trail.
    breadcrumb = detail_soup.find("ul", class_="breadcrumb")
    crumbs = breadcrumb.find_all("li") if breadcrumb else []
    genre = crumbs[2].text.strip() if len(crumbs) > 2 else ""

    # Availability text, e.g. "In stock (19 available)".
    availability_tag = detail_soup.find("p", class_="instock availability")
    availability = availability_tag.text.strip() if availability_tag else ""

    # UPC: 'string=' replaces the deprecated 'text=' argument of find().
    upc = ""
    upc_header = detail_soup.find("th", string="UPC")
    if upc_header:
        upc_cell = upc_header.find_next_sibling("td")
        if upc_cell:
            upc = upc_cell.text.strip()

    return upc, genre, availability, description


def scrape_books(min_rating=4, max_price=20.0):
    """Scrape Books to Scrape and return the books matching both filters.

    Catalogue pages are visited in order (page-1.html, page-2.html, ...)
    until a page does not answer with status 200. For each book with
    rating >= ``min_rating`` and price <= ``max_price`` the detail page is
    fetched to collect UPC, genre, availability and description.

    Args:
        min_rating: Minimum star rating (1-5) a book must have.
        max_price: Maximum price in pounds a book may cost.

    Returns:
        A pandas DataFrame with the columns UPC, Title, Price (£), Rating,
        Genre, Availability and Description; one row per matching book.
    """
    # HTTPS, consistent with the URL used elsewhere in the notebook.
    base_url = "https://books.toscrape.com/catalogue/"
    books_data = []
    page_num = 1

    # One session reuses the connection across the many requests made here.
    with requests.Session() as session:
        while True:
            page_url = f"{base_url}page-{page_num}.html"
            soup = _fetch_soup(session, page_url)

            # Stop if page does not exist
            if soup is None:
                break

            for book in soup.find_all("article", class_="product_pod"):
                title = book.h3.a["title"]
                price = parse_price(book.find("p", class_="price_color").text)
                rating = parse_rating(book)

                # Only matching books are worth a detail-page request.
                if rating < min_rating or price > max_price:
                    continue

                # Resolve the (possibly relative) link against the page URL
                # instead of stripping '../' segments by hand.
                detail_url = urljoin(page_url, book.h3.a["href"])
                upc, genre, availability, description = _parse_detail_page(
                    _fetch_soup(session, detail_url)
                )

                books_data.append({
                    "UPC": upc,
                    "Title": title,
                    "Price (£)": price,
                    "Rating": rating,
                    "Genre": genre,
                    "Availability": availability,
                    "Description": description,
                })

            page_num += 1  # Move to next page

    return pd.DataFrame(books_data, columns=_BOOK_COLUMNS)


# Example Usage
if __name__ == "__main__":
    df_books = scrape_books(min_rating=4, max_price=20.0)

    # Show first few results
    print(df_books.head())

    # Show total number of books found
    print(f"Total books found: {len(df_books)}")