import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Listing-page URL template; `{}` is replaced by the 1-based page number.
base_url = 'https://books.toscrape.com/catalogue/page-{}.html'
PAGE_COUNT = 50        # listing pages 1..PAGE_COUNT are requested
REQUEST_TIMEOUT = 10   # seconds; without it a stalled connection hangs the cell

book_urls = []

# Step 1: Collect all book URLs
# One Session is reused so the TCP connection is shared across all pages.
with requests.Session() as session:
    for page_num in range(1, PAGE_COUNT + 1):
        page_url = base_url.format(page_num)
        response = session.get(page_url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page that would
        # silently contribute zero links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for h3 in soup.find_all('h3'):
            anchor = h3.find('a')
            if anchor is None or not anchor.get('href'):
                # Heading without a usable link: nothing to collect.
                continue
            # Resolve the relative href against the page it was found on;
            # this also collapses any leading '../' segments.
            book_urls.append(urljoin(page_url, anchor['href']))

print(f'Total books found: {len(book_urls)}')
import pandas as pd

# Step 2: Visit each book page, extract its details and keep the matches
rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# Filter thresholds, defined once instead of on every loop iteration.
# (The previous `4 or 5` expression always evaluated to 4.)
MIN_RATING = 4
MAX_PRICE = 20
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection hangs the cell


def first_sentence(text):
    """Return *text* up to and including its first '.', or all of it if none."""
    head, dot, _ = text.partition('.')
    return head.strip() + '.' if dot else text


def parse_price(text):
    """Convert a price label such as '£51.77' to a float.

    A stray 'Â' in front of the pound sign (seen when the page is decoded
    with the wrong charset) is tolerated.

    Raises:
        ValueError: if the remaining text is not a number.
    """
    return float(text.strip().lstrip('Â£'))


def parse_rating(soup):
    """Return the numeric star rating found in *soup*, or None if absent."""
    star_tag = soup.find('p', class_='star-rating')
    if not star_tag:
        return None
    # The tag carries 'star-rating' plus a rating word ('One'..'Five');
    # pick the first class that is a known rating word.
    word = next((c for c in star_tag['class'] if c in rating_map), None)
    return rating_map.get(word)


def parse_description(soup):
    """Return the first sentence of the product description, or None."""
    desc_heading = soup.find('div', id='product_description')
    if not desc_heading:
        return None
    desc_tag = desc_heading.find_next_sibling('p')
    if not desc_tag:
        return None
    return first_sentence(desc_tag.text.strip())


def parse_book(soup):
    """Extract one book's fields from its parsed product page.

    Returns:
        dict with keys 'title', 'upc', 'rating', 'price', 'availability',
        'genre' and 'description'. 'rating', 'genre' and 'description' are
        None when the corresponding markup is missing.

    Raises:
        AttributeError: if a mandatory element (title, UPC, price,
            availability) is not present on the page.
        ValueError: if the price text cannot be parsed as a number.
    """
    breadcrumb_links = soup.select('ul.breadcrumb li a')
    return {
        'title': soup.find('div', class_='product_main').find('h1').text.strip(),
        'upc': soup.find('th', string='UPC').find_next('td').text.strip(),
        'rating': parse_rating(soup),
        'price': parse_price(soup.find(class_='price_color').text),
        'availability': soup.find(class_='instock availability').text.strip(),
        # The last breadcrumb link is used as the genre, but only when at
        # least three links are present.
        'genre': (
            breadcrumb_links[-1].text.strip()
            if len(breadcrumb_links) >= 3 else None
        ),
        'description': parse_description(soup),
    }


books_dc = []

with requests.Session() as session:
    for i, url in enumerate(book_urls):
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            response.encoding = 'utf-8'
            book = parse_book(BeautifulSoup(response.text, 'html.parser'))
        except (requests.RequestException, AttributeError, KeyError,
                TypeError, ValueError) as e:
            # Best effort: report the failing page and carry on with the rest.
            print(f'Failed for {url}: {e}')
        else:
            # Apply filters
            rating = book['rating']
            if (rating is not None and rating >= MIN_RATING
                    and book['price'] <= MAX_PRICE):
                books_dc.append(book)

        # Optional: show progress
        if (i + 1) % 50 == 0:
            print(f"Scraped {i+1}/{len(book_urls)} books...", end='\r')
        time.sleep(0.05)  # small pause between requests

# Terminate the '\r' progress line so the summary does not overwrite it and
# leave stray characters behind (e.g. "Total books scraped: 75...").
print()
print(f"Total books scraped: {len(books_dc)}")
print("First 5 books:", books_dc[:5])

df = pd.DataFrame(books_dc)
\n", + " | title | \n", + "upc | \n", + "rating | \n", + "price | \n", + "availability | \n", + "genre | \n", + "description | \n", + "
---|---|---|---|---|---|---|---|
0 | \n", + "Set Me Free | \n", + "ce6396b0f23f6ecc | \n", + "5 | \n", + "17.46 | \n", + "In stock (19 available) | \n", + "Young Adult | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "
1 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "6258a1f6a6dcfe50 | \n", + "5 | \n", + "17.66 | \n", + "In stock (18 available) | \n", + "Spirituality | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "
2 | \n", + "Sophie's World | \n", + "6be3beb0793a53e7 | \n", + "5 | \n", + "15.94 | \n", + "In stock (18 available) | \n", + "Philosophy | \n", + "A page-turning novel that is also an explorati... | \n", + "
3 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "657fe5ead67a7767 | \n", + "4 | \n", + "14.27 | \n", + "In stock (16 available) | \n", + "Poetry | \n", + "More than thirty-five years ago, when the weat... | \n", + "
4 | \n", + "This One Summer | \n", + "51653ef291ab7ddc | \n", + "4 | \n", + "19.49 | \n", + "In stock (16 available) | \n", + "Sequential Art | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
70 | \n", + "The Zombie Room | \n", + "9c96cd1329fbd82d | \n", + "5 | \n", + "19.69 | \n", + "In stock (1 available) | \n", + "Default | \n", + "An unlikely bond is forged between three men f... | \n", + "
71 | \n", + "The Silent Wife | \n", + "b78deb463531d078 | \n", + "5 | \n", + "12.34 | \n", + "In stock (1 available) | \n", + "Fiction | \n", + "A chilling psychological thriller about a marr... | \n", + "
72 | \n", + "The Girl You Lost | \n", + "4280ac3eab57aa5d | \n", + "5 | \n", + "12.29 | \n", + "In stock (1 available) | \n", + "Mystery | \n", + "Eighteen years ago your baby daughter was snat... | \n", + "
73 | \n", + "The Edge of Reason (Bridget Jones #2) | \n", + "29fc016c459aeb14 | \n", + "4 | \n", + "19.18 | \n", + "In stock (1 available) | \n", + "Womens Fiction | \n", + "Monday 27 January“7:15 a. | \n", + "
74 | \n", + "A Spy's Devotion (The Regency Spies of London #1) | \n", + "19fec36a1dfb4c16 | \n", + "5 | \n", + "16.97 | \n", + "In stock (1 available) | \n", + "Historical Fiction | \n", + "In England’s Regency era, manners and elegance... | \n", + "
75 rows × 7 columns
\n", + "