"""Scrape books.toscrape.com and keep the well-rated, inexpensive books.

Workflow:
  1. Walk the paginated catalogue and collect every book's detail-page URL.
  2. Visit each detail page and extract title, UPC, star rating, price,
     availability, genre and the first sentence of the description.
  3. Keep only the books that satisfy ``min_rating`` / ``max_price`` and load
     them into a pandas DataFrame (``df``).

Module-level results when run as a script or notebook cell:
``book_urls`` (list of str), ``books_dc`` (list of dict), ``df`` (DataFrame).
"""
import re
import time
from urllib.parse import urljoin

import pandas as pd

base_url = 'https://books.toscrape.com/catalogue/page-{}.html'
rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# Filters applied to every scraped book.
# (The previous `4 or 5` always evaluated to 4; the intent is "4 stars or more".)
min_rating = 4
max_price = 20

# Seconds to wait for a response before giving up, so that one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 10


def parse_rating(class_names):
    """Return the 1-5 star rating encoded in a tag's CSS classes, or None.

    ``class_names`` is the class list of the ``star-rating`` tag, e.g.
    ``['star-rating', 'Three']``. Classes that are not keys of ``rating_map``
    are ignored.
    """
    for class_name in class_names or ():
        if class_name in rating_map:
            return rating_map[class_name]
    return None


def parse_price(price_text):
    """Convert a price label such as ``'£17.46'`` to a float.

    Every leading non-digit character is dropped, which covers the pound sign
    as well as the stray ``'Â'`` that appears when the page is decoded with
    the wrong charset.

    Raises:
        ValueError: if what remains is not a valid number.
    """
    return float(re.sub(r'^\D+', '', price_text.strip()))


def first_sentence(text):
    """Return ``text`` up to and including its first dot.

    Text without any dot is returned stripped but otherwise unchanged.
    NOTE: the cut is made at the first '.', so abbreviations such as "a.m."
    end the sentence early.
    """
    text = text.strip()
    head, dot, _ = text.partition('.')
    return head.strip() + '.' if dot else text


def passes_filters(rating, price):
    """Tell whether a book satisfies the ``min_rating`` / ``max_price`` filters."""
    return rating is not None and rating >= min_rating and price <= max_price


def fetch_soup(session, url):
    """Download ``url`` with ``session`` and return it parsed as BeautifulSoup.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so an error page is never
            parsed as if it were a book or catalogue page.
    """
    # Imported lazily so the pure parsing helpers above can be imported and
    # tested on a machine that does not have bs4 installed.
    from bs4 import BeautifulSoup

    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode explicitly as UTF-8; with a wrongly guessed charset the pound
    # sign shows up as 'Â£'.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def collect_book_urls(session, pages=range(1, 51)):
    """Return the absolute detail-page URL of every book on the given pages.

    A page that cannot be downloaded is reported and skipped so a single
    network hiccup does not abort the whole collection.
    """
    urls = []
    for page_num in pages:
        page_url = base_url.format(page_num)
        try:
            soup = fetch_soup(session, page_url)
        except OSError as e:  # requests.RequestException derives from OSError
            print(f'Failed for {page_url}: {e}')
            continue
        for h3 in soup.find_all('h3'):
            anchor = h3.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            # Resolve the relative href against the page it was found on.
            urls.append(urljoin(page_url, anchor['href']))
    return urls


def scrape_book(session, url):
    """Scrape one book detail page and return its fields as a dict.

    Keys, in order: title, upc, rating, price, availability, genre,
    description. ``rating``, ``genre`` and ``description`` are None when the
    corresponding element is missing from the page.

    Raises:
        Exception: whatever the download or the parsing of a mandatory field
            (title, upc, price, availability) raises; the caller decides how
            to report it.
    """
    soup = fetch_soup(session, url)

    star_tag = soup.find('p', class_='star-rating')
    breadcrumb_links = soup.select('ul.breadcrumb li a')
    desc_heading = soup.find('div', id='product_description')
    desc_tag = desc_heading.find_next_sibling('p') if desc_heading else None

    return {
        'title': soup.find('div', class_='product_main').find('h1').text.strip(),
        'upc': soup.find('th', string='UPC').find_next('td').text.strip(),
        'rating': parse_rating(star_tag['class']) if star_tag else None,
        'price': parse_price(soup.find(class_='price_color').text),
        'availability': soup.find(class_='instock availability').text.strip(),
        # The genre is the last breadcrumb link; it is only trusted when at
        # least three links are present.
        'genre': breadcrumb_links[-1].text.strip() if len(breadcrumb_links) >= 3 else None,
        'description': first_sentence(desc_tag.text) if desc_tag else None,
    }


# Jupyter executes cells with __name__ == '__main__', so this still runs in
# the notebook and leaves book_urls, books_dc and df as notebook globals,
# while importing the helpers elsewhere triggers no network traffic.
if __name__ == '__main__':
    import requests

    # One session reuses the underlying connection across all requests.
    with requests.Session() as session:
        # Step 1: Collect all book URLs
        book_urls = collect_book_urls(session)
        print(f'Total books found: {len(book_urls)}')

        # Step 2: Visit each book page and keep the books passing the filters
        books_dc = []
        for i, url in enumerate(book_urls):
            try:
                book = scrape_book(session, url)
            except Exception as e:  # best-effort: report and move on
                print(f'Failed for {url}: {e}')
                continue

            if passes_filters(book['rating'], book['price']):
                books_dc.append(book)

            # Optional: show progress
            if (i + 1) % 50 == 0:
                print(f"Scraped {i+1}/{len(book_urls)} books...", end='\r')
            time.sleep(0.05)  # be polite to the server

    # Terminate the '\r' progress line so the summary does not overwrite it.
    print()
    print(f"Total books scraped: {len(books_dc)}")
    print("First 5 books:", books_dc[:5])

    df = pd.DataFrame(books_dc)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleupcratingpriceavailabilitygenredescription
0Set Me Freece6396b0f23f6ecc517.46In stock (19 available)Young AdultAaron Ledbetter’s future had been planned out ...
1The Four Agreements: A Practical Guide to Pers...6258a1f6a6dcfe50517.66In stock (18 available)SpiritualityIn The Four Agreements, don Miguel Ruiz reveal...
2Sophie's World6be3beb0793a53e7515.94In stock (18 available)PhilosophyA page-turning novel that is also an explorati...
3Untitled Collection: Sabbath Poems 2014657fe5ead67a7767414.27In stock (16 available)PoetryMore than thirty-five years ago, when the weat...
4This One Summer51653ef291ab7ddc419.49In stock (16 available)Sequential ArtEvery summer, Rose goes with her mom and dad t...
........................
70The Zombie Room9c96cd1329fbd82d519.69In stock (1 available)DefaultAn unlikely bond is forged between three men f...
71The Silent Wifeb78deb463531d078512.34In stock (1 available)FictionA chilling psychological thriller about a marr...
72The Girl You Lost4280ac3eab57aa5d512.29In stock (1 available)MysteryEighteen years ago your baby daughter was snat...
73The Edge of Reason (Bridget Jones #2)29fc016c459aeb14419.18In stock (1 available)Womens FictionMonday 27 January“7:15 a.
74A Spy's Devotion (The Regency Spies of London #1)19fec36a1dfb4c16516.97In stock (1 available)Historical FictionIn England’s Regency era, manners and elegance...
\n", + "

75 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " title upc \\\n", + "0 Set Me Free ce6396b0f23f6ecc \n", + "1 The Four Agreements: A Practical Guide to Pers... 6258a1f6a6dcfe50 \n", + "2 Sophie's World 6be3beb0793a53e7 \n", + "3 Untitled Collection: Sabbath Poems 2014 657fe5ead67a7767 \n", + "4 This One Summer 51653ef291ab7ddc \n", + ".. ... ... \n", + "70 The Zombie Room 9c96cd1329fbd82d \n", + "71 The Silent Wife b78deb463531d078 \n", + "72 The Girl You Lost 4280ac3eab57aa5d \n", + "73 The Edge of Reason (Bridget Jones #2) 29fc016c459aeb14 \n", + "74 A Spy's Devotion (The Regency Spies of London #1) 19fec36a1dfb4c16 \n", + "\n", + " rating price availability genre \\\n", + "0 5 17.46 In stock (19 available) Young Adult \n", + "1 5 17.66 In stock (18 available) Spirituality \n", + "2 5 15.94 In stock (18 available) Philosophy \n", + "3 4 14.27 In stock (16 available) Poetry \n", + "4 4 19.49 In stock (16 available) Sequential Art \n", + ".. ... ... ... ... \n", + "70 5 19.69 In stock (1 available) Default \n", + "71 5 12.34 In stock (1 available) Fiction \n", + "72 5 12.29 In stock (1 available) Mystery \n", + "73 4 19.18 In stock (1 available) Womens Fiction \n", + "74 5 16.97 In stock (1 available) Historical Fiction \n", + "\n", + " description \n", + "0 Aaron Ledbetter’s future had been planned out ... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... \n", + ".. ... \n", + "70 An unlikely bond is forged between three men f... \n", + "71 A chilling psychological thriller about a marr... \n", + "72 Eighteen years ago your baby daughter was snat... \n", + "73 Monday 27 January“7:15 a. \n", + "74 In England’s Regency era, manners and elegance... 
\n", + "\n", + "[75 rows x 7 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1ad0f0e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -126,7 +457,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +471,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.7" } }, "nbformat": 4,