diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..cd46201 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -110,14 +110,188 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "40359eee-9cd7-4884-bfa4-83344c222305", - "metadata": { - "id": "40359eee-9cd7-4884-bfa4-83344c222305" - }, - "outputs": [], + "execution_count": 2, + "id": "eeaba4af", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 75 books\n", + " Title Category Price (£) Rating InStock StockCount Description URL\n", + " Set Me Free Young Adult 17.46 5 True None Aaron Ledbetter’s future had been planned out for him since before he was born. Each year, the Ledbetter family vacation on Tybee Island gave Aaron a chance to briefly free himself from his family’s expectations. When he meets Jonas “Lucky” Luckett, a caricature artist in town with the traveling carnival, he must choose between the life that’s been mapped out for him, and Aaron Ledbetter’s future had been planned out for him since before he was born. Each year, the Ledbetter family vacation on Tybee Island gave Aaron a chance to briefly free himself from his family’s expectations. When he meets Jonas “Lucky” Luckett, a caricature artist in town with the traveling carnival, he must choose between the life that’s been mapped out for him, and the chance at true love. ...more http://books.toscrape.com/catalogue/set-me-free_988/index.html\n", + " The Four Agreements: A Practical Guide to Personal Freedom Spirituality 17.66 5 True None In The Four Agreements, don Miguel Ruiz reveals the source of self-limiting beliefs that rob us of joy and create needless suffering. Based on ancient Toltec wisdom, the Four Agreements offer a powerful code of conduct that can rapidly transform our lives to a new experience of freedom, true happiness, and love. 
The Four Agreements are: Be Impeccable With Your Word, Don't In The Four Agreements, don Miguel Ruiz reveals the source of self-limiting beliefs that rob us of joy and create needless suffering. Based on ancient Toltec wisdom, the Four Agreements offer a powerful code of conduct that can rapidly transform our lives to a new experience of freedom, true happiness, and love. The Four Agreements are: Be Impeccable With Your Word, Don't Take Anything Personally, Don't Make Assumptions, Always Do Your Best. ...more http://books.toscrape.com/catalogue/the-four-agreements-a-practical-guide-to-personal-freedom_970/index.html\n", + " Sophie's World Philosophy 15.94 5 True None A page-turning novel that is also an exploration of the great philosophical concepts of Western thought, Sophie’s World has fired the imagination of readers all over the world, with more than twenty million copies in print.One day fourteen-year-old Sophie Amundsen comes home from school to find in her mailbox two notes, with one question on each: “Who are you?” and “Where A page-turning novel that is also an exploration of the great philosophical concepts of Western thought, Sophie’s World has fired the imagination of readers all over the world, with more than twenty million copies in print.One day fourteen-year-old Sophie Amundsen comes home from school to find in her mailbox two notes, with one question on each: “Who are you?” and “Where does the world come from?” From that irresistible beginning, Sophie becomes obsessed with questions that take her far beyond what she knows of her Norwegian village. Through those letters, she enrolls in a kind of correspondence course, covering Socrates to Sartre, with a mysterious philosopher, while receiving letters addressed to another girl. Who is Hilde? And why does her mail keep turning up? To unravel this riddle, Sophie must use the philosophy she is learning—but the truth turns out to be far more complicated than she could have imagined. 
...more http://books.toscrape.com/catalogue/sophies-world_966/index.html\n", + " Untitled Collection: Sabbath Poems 2014 Poetry 14.27 4 True None More than thirty-five years ago, when the weather allowed, Wendell Berry began spending his sabbaths outdoors, walking and wandering around familiar territory, seeking a deep intimacy only time could provide. These walks arranged themselves into poems and each year since he has completed a sequence dated by the year of its composition. Last year we collected the lot into a More than thirty-five years ago, when the weather allowed, Wendell Berry began spending his sabbaths outdoors, walking and wandering around familiar territory, seeking a deep intimacy only time could provide. These walks arranged themselves into poems and each year since he has completed a sequence dated by the year of its composition. Last year we collected the lot into a collection, This Day, the Sabbath Poems 1979-2013. This new sequence for the following year is one of the richest yet. This group provides a virtual syllabus for all of Mr. Berry’s cultural and agricultural work in concentrated form. Many of these poems are drawn from the view from a small porch in the woods, a place of stillness and reflection, a vantage point “of the one/life of the forest composed/of uncountable lives in countless/years each life coherent itself within/ the coherence, the great composure,/of all.” A new collection of Wendell Berry poems is always an occasion of joyful celebration and this one is especially so. ...more http://books.toscrape.com/catalogue/untitled-collection-sabbath-poems-2014_953/index.html\n", + " This One Summer Sequential Art 19.49 4 True None Every summer, Rose goes with her mom and dad to a lake house in Awago Beach. It's their getaway, their refuge. Rosie's friend Windy is always there, too, like the little sister she never had. But this summer is different. 
Rose's mom and dad won't stop fighting, and when Rose and Windy seek a distraction from the drama, they find themselves with a whole new set of problems. Every summer, Rose goes with her mom and dad to a lake house in Awago Beach. It's their getaway, their refuge. Rosie's friend Windy is always there, too, like the little sister she never had. But this summer is different. Rose's mom and dad won't stop fighting, and when Rose and Windy seek a distraction from the drama, they find themselves with a whole new set of problems. It's a summer of secrets and sorrow and growing up, and it's a good thing Rose and Windy have each other.In This One Summer two stellar creators redefine the teen graphic novel. Cousins Mariko and Jillian Tamaki, the team behind Skim, have collaborated on this gorgeous, heartbreaking, and ultimately hopeful story about a girl on the cusp of her teen age — a story of renewal and revelation. ...more http://books.toscrape.com/catalogue/this-one-summer_947/index.html\n", + " Thirst Fiction 17.27 5 True None On a searing summer Friday, Eddie Chapman has been stuck for hours in a traffic jam. There are accidents along the highway, but ambulances and police are conspicuously absent. When he decides to abandon his car and run home, he sees that the trees along the edge of a stream have been burnt, and the water in the streambed is gone. Something is very wrong.When he arrives hom On a searing summer Friday, Eddie Chapman has been stuck for hours in a traffic jam. There are accidents along the highway, but ambulances and police are conspicuously absent. When he decides to abandon his car and run home, he sees that the trees along the edge of a stream have been burnt, and the water in the streambed is gone. Something is very wrong.When he arrives home, the power is out and there is no running water. The pipes everywhere, it seems, have gone dry. 
Eddie and his wife, Laura, find themselves thrust together with their neighbors while a sense of unease thickens in the stifling night air. Thirst takes place in the immediate aftermath of a mysterious disaster--the Chapmans and their neighbors suffer the effects of the heat, their thirst, and the terrifying realization that no one is coming to help. As violence rips through the community, Eddie and Laura are forced to recall secrets from their past and question their present humanity. In crisp and convincing prose, Ben Warner compels readers to do the same. What might you do to survive? ...more http://books.toscrape.com/catalogue/thirst_946/index.html\n", + " Princess Jellyfish 2-in-1 Omnibus, Vol. 01 (Princess Jellyfish 2-in-1 Omnibus #1) Sequential Art 13.61 5 True None THE LONG-AWAITED STORY OF FANGIRLS TAKING ON TOKYO!Special large-size 2-in-1 edition of over 400 pages!\"One of the best anime and manga for beginners. Enthusiasm - geeky and otherwise - is power in Princess Jellyfish. Enthusiasm saves the day and paves the road to the future.\" - Kotaku\"Princess Jellyfish's ambition is simple: to tell a delightful story in a delightful way. THE LONG-AWAITED STORY OF FANGIRLS TAKING ON TOKYO!Special large-size 2-in-1 edition of over 400 pages!\"One of the best anime and manga for beginners. Enthusiasm - geeky and otherwise - is power in Princess Jellyfish. Enthusiasm saves the day and paves the road to the future.\" - Kotaku\"Princess Jellyfish's ambition is simple: to tell a delightful story in a delightful way... It's a pretty deadly one-two punch.\" - Anime News Network\"Loaded with heart, soul, humor and insight.\" - About.comSTINGING BEAUTY Tsukimi Kurashita has a strange fascination with jellyfish. She’s loved them from a young age and has carried that love with her to her new life in the big city of Tokyo. There, she resides in Amamizukan, a safe-haven for girl geeks who regularly gush over a range of things from trains to Japanese dolls. 
However, a chance meeting at a pet shop has Tsukimi crossing paths with one of the things that the residents of Amamizukan have been desperately trying to avoid—a beautiful and fashionable woman! But there’s much more to this woman than her trendy clothes! This odd encounter is only the beginning of a new and unexpected path for Tsukimi and her friends. ...more http://books.toscrape.com/catalogue/princess-jellyfish-2-in-1-omnibus-vol-01-princess-jellyfish-2-in-1-omnibus-1_920/index.html\n", + " Princess Between Worlds (Wide-Awake Princess #5) Fantasy 13.34 5 True None Just as Annie and Liam are busy making plans to travel the world, a witch shows up and gives them a collection of postcards from the Magic Marketplace. Each postcard gives Annie and Liam the opportunity to travel to exotic lands and far-flung kingdoms. What the witch doesn't give them are directions on how to safely return. http://books.toscrape.com/catalogue/princess-between-worlds-wide-awake-princess-5_919/index.html\n", + " Outcast, Vol. 1: A Darkness Surrounds Him (Outcast #1) Sequential Art 15.44 4 True None NEW HORROR SERIES FROM THE WALKING DEAD CREATOR ROBERT KIRKMAN! Kyle Barnes has been plagued by demonic possession all his life and now he needs answers. Unfortunately, what he uncovers along the way could bring about the end of life on Earth as we know it. Collects OUTCAST BY KIRKMAN & AZACETA #1-6. http://books.toscrape.com/catalogue/outcast-vol-1-a-darkness-surrounds-him-outcast-1_915/index.html\n", + "Mama Tried: Traditional Italian Cooking for the Screwed, Crude, Vegan, and Tattooed Food and Drink 14.02 4 True None Cecilia Granata grew up cooking with her family in Italy. As a vegan, she learned to adapt her favorite recipes from around the country to be animal free while retaining the flavor and feeling of true Italian home cooking. 
# Scraper for http://books.toscrape.com/ : walks the paginated catalogue,
# filters books by star rating and price, and enriches each match with data
# from its product page (category, description, stock count).

import re
import time
import warnings
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd


BASE = "http://books.toscrape.com/"
CATALOGUE_PAGE = urljoin(BASE, "catalogue/page-{}.html")

# The site encodes the star rating as a CSS class word on <p class="star-rating ...">.
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0 Safari/537.36"
}

# Column order of the DataFrame returned by scrape_books().
COLUMNS = [
    "Title", "Category", "Price (£)", "Rating",
    "InStock", "StockCount", "Description", "URL",
]

# 4xx codes that are worth retrying; every other 4xx is treated as permanent.
_RETRYABLE_CLIENT_ERRORS = frozenset({408, 429})

# One shared session so the many page requests reuse a single connection.
_SESSION = requests.Session()
_SESSION.headers.update(HEADERS)


def get_soup(url, *, retries=3, pause=0.5):
    """Fetch ``url`` and return it parsed as BeautifulSoup, retrying transient failures.

    Args:
        url (str): absolute URL to fetch.
        retries (int): maximum number of attempts; values below 1 are treated as 1.
        pause (float): base delay in seconds; attempt ``k`` (1-based) is followed
            by a ``pause * k`` sleep before the next attempt.

    Returns:
        BeautifulSoup: the parsed document.

    Raises:
        requests.HTTPError: immediately for a permanent client error (e.g. 404),
            or after the last attempt for a server / retryable error.
        requests.ConnectionError, requests.Timeout: if the last attempt failed
            at the network level.
    """
    attempts = max(1, retries)
    last_error = None
    for attempt in range(attempts):
        try:
            resp = _SESSION.get(url, timeout=15)
            resp.raise_for_status()
        except requests.HTTPError as exc:
            status = exc.response.status_code
            # A missing page will not appear on retry: fail fast instead of
            # sleeping through every attempt (this is how the crawl detects
            # the end of the catalogue).
            if status < 500 and status not in _RETRYABLE_CLIENT_ERRORS:
                raise
            last_error = exc
        except (requests.ConnectionError, requests.Timeout) as exc:
            last_error = exc
        else:
            return BeautifulSoup(resp.text, "html.parser")
        # No point sleeping after the final attempt.
        if attempt < attempts - 1:
            time.sleep(pause * (attempt + 1))
    raise last_error


def parse_price(text):
    """Extract a float from price text such as '£51.77'.

    Thousands separators are dropped and the first decimal number found is
    used, so stray characters around the amount (currency symbols, mis-decoded
    bytes, trailing punctuation) do not matter.

    Returns:
        float: the parsed price, or NaN when ``text`` contains no number.
    """
    match = re.search(r"\d*\.\d+|\d+", (text or "").replace(",", ""))
    return float(match.group()) if match else float("nan")


def parse_availability(text):
    """Parse an availability label.

    Returns:
        tuple: ``(in_stock, count)`` where ``count`` is an int or None,
        e.g. 'In stock (22 available)' -> (True, 22), 'In stock' -> (True, None).
    """
    label = " ".join((text or "").split())  # collapse runs of whitespace/newlines
    found = re.search(r"\((\d+)\s+available\)", label)
    return "In stock" in label, (int(found.group(1)) if found else None)


def _parse_product_page(soup):
    """Return ``(category, description, stock_count)`` from a parsed product page."""
    # Breadcrumb links are Home > Books > <Category>; the book title itself is
    # not a link, so the last <a> is the category.
    crumbs = soup.select("ul.breadcrumb li a")
    category = crumbs[-1].get_text(strip=True) if crumbs else None

    # The description is the <p> that follows the #product_description header.
    description = None
    marker = soup.find(id="product_description")
    if marker:
        paragraph = marker.find_next("p")
        if paragraph:
            description = paragraph.get_text(strip=True)

    # Only the product page states how many copies are available.
    stock_count = None
    availability = soup.select_one("p.availability")
    if availability:
        _, stock_count = parse_availability(availability.get_text(" ", strip=True))

    return category, description, stock_count


def get_book_description(book_url):
    """Visit a product page and return ``(category, description)``.

    Page layout on Books to Scrape:
      - a <div id="product_description"> header followed by a <p> holding the text
      - the category is the last linked breadcrumb item (the entry just before
        the book title)

    Either value is None when the corresponding element is missing.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    category, description, _ = _parse_product_page(get_soup(book_url))
    return category, description


def _parse_listing_item(prod, page_url):
    """Extract the fields available on a catalogue listing for one <article.product_pod>."""
    link = prod.select_one("h3 a")
    # The visible link text is truncated; the full title lives in the attribute.
    title = (link.get("title") or link.get_text(strip=True)).strip()

    price_tag = prod.select_one("p.price_color")
    price = parse_price(price_tag.get_text(strip=True) if price_tag else "")

    rating_tag = prod.select_one("p.star-rating")
    rating_classes = rating_tag.get("class", []) if rating_tag else []
    rating = next((RATING_MAP[c] for c in rating_classes if c in RATING_MAP), 0)

    # Match on 'availability' alone so an item that is not in stock (and so
    # lacks the 'instock' class) does not crash the crawl.
    avail_tag = prod.select_one("p.availability")
    in_stock, stock_count = parse_availability(
        avail_tag.get_text(" ", strip=True) if avail_tag else ""
    )

    return {
        "Title": title,
        "Price (£)": price,
        "Rating": rating,
        "InStock": in_stock,
        "StockCount": stock_count,
        # Listing hrefs are relative to the listing page itself.
        "URL": urljoin(page_url, link.get("href", "")),
    }


def scrape_books(min_rating=4, max_price=20.0, max_pages=None, pause_between_pages=0.15):
    """
    Scrape Books to Scrape into a pandas DataFrame, filtered by rating/price.

    Args:
        min_rating (int): minimum star rating (1–5)
        max_price (float): maximum price (in pounds)
        max_pages (int|None): cap on catalogue pages; None (or 0) crawls until
            the site returns an error page or a page without products
        pause_between_pages (float): polite pause between page fetches

    Returns:
        pandas.DataFrame: one row per matching book with the columns listed in
        ``COLUMNS`` (present even when no book matches).

    Raises:
        requests.RequestException: on network failure, or if a matching book's
            product page cannot be fetched.
    """
    rows = []
    page = 1

    while not max_pages or page <= max_pages:
        page_url = CATALOGUE_PAGE.format(page)
        try:
            soup = get_soup(page_url)
        except requests.HTTPError as exc:
            # 404 is the normal end of the catalogue; anything else means the
            # result is probably incomplete, so say so instead of hiding it.
            if getattr(exc.response, "status_code", None) != 404:
                warnings.warn(
                    f"Stopping crawl early at {page_url}: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )
            break

        products = soup.select("article.product_pod")
        if not products:
            break

        for prod in products:
            row = _parse_listing_item(prod, page_url)

            # Filter before the (expensive) product-page request. Written as
            # `not <=` so an unparseable (NaN) price is excluded too.
            if row["Rating"] < min_rating or not row["Price (£)"] <= max_price:
                continue

            category, description, detail_count = _parse_product_page(get_soup(row["URL"]))
            row["Category"] = category
            row["Description"] = description
            # Listing pages only say 'In stock'; the count comes from the product page.
            if detail_count is not None:
                row["StockCount"] = detail_count
            rows.append(row)

        page += 1
        time.sleep(pause_between_pages)

    # Passing `columns` fixes the order and keeps the schema for an empty result.
    return pd.DataFrame(rows, columns=COLUMNS)


# ---------- Example run ----------
if __name__ == "__main__":
    # Books rated 4★ or 5★, price ≤ £20; crawl all pages
    df = scrape_books(min_rating=4, max_price=20.0)
    print(f"Found {len(df)} books")
    # Descriptions run to several paragraphs; truncate cells so the preview stays readable.
    print(df.head(10).to_string(index=False, max_colwidth=60))