Skip to content
Open

done #350

Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
339 changes: 335 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,354 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "edd2ac10",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total books found: 1000\n"
]
}
],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"base_url = 'https://books.toscrape.com/catalogue/page-{}.html'\n",
"book_urls = []\n",
"\n",
"# Step 1: Collect all book URLs\n",
"for page_num in range(1, 51):\n",
" url = base_url.format(page_num)\n",
" response = requests.get(url)\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
" for h3 in soup.find_all('h3'):\n",
" link = h3.find('a')['href']\n",
" abs_link = 'https://books.toscrape.com/catalogue/' + link.replace('../../../', '')\n",
" book_urls.append(abs_link)\n",
"\n",
"print(f'Total books found: {len(book_urls)}')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7844424e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total books scraped: 75...\n",
"First 5 books: [{'title': 'Set Me Free', 'upc': 'ce6396b0f23f6ecc', 'rating': 5, 'price': 17.46, 'availability': 'In stock (19 available)', 'genre': 'Young Adult', 'description': 'Aaron Ledbetter’s future had been planned out for him since before he was born.'}, {'title': 'The Four Agreements: A Practical Guide to Personal Freedom', 'upc': '6258a1f6a6dcfe50', 'rating': 5, 'price': 17.66, 'availability': 'In stock (18 available)', 'genre': 'Spirituality', 'description': 'In The Four Agreements, don Miguel Ruiz reveals the source of self-limiting beliefs that rob us of joy and create needless suffering.'}, {'title': \"Sophie's World\", 'upc': '6be3beb0793a53e7', 'rating': 5, 'price': 15.94, 'availability': 'In stock (18 available)', 'genre': 'Philosophy', 'description': 'A page-turning novel that is also an exploration of the great philosophical concepts of Western thought, Sophie’s World has fired the imagination of readers all over the world, with more than twenty million copies in print.'}, {'title': 'Untitled Collection: Sabbath Poems 2014', 'upc': '657fe5ead67a7767', 'rating': 4, 'price': 14.27, 'availability': 'In stock (16 available)', 'genre': 'Poetry', 'description': 'More than thirty-five years ago, when the weather allowed, Wendell Berry began spending his sabbaths outdoors, walking and wandering around familiar territory, seeking a deep intimacy only time could provide.'}, {'title': 'This One Summer', 'upc': '51653ef291ab7ddc', 'rating': 4, 'price': 19.49, 'availability': 'In stock (16 available)', 'genre': 'Sequential Art', 'description': 'Every summer, Rose goes with her mom and dad to a lake house in Awago Beach.'}]\n"
]
}
],
"source": [
"# Step 2: Visit each book page and get Title & UPC\n",
"rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
"\n",
"books_dc = []\n",
"\n",
"for i, url in enumerate(book_urls):\n",
" try:\n",
" response = requests.get(url)\n",
" response.encoding = 'utf-8'\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
" books = {}\n",
"\n",
" \n",
" books['title'] = soup.find('div', class_='product_main').find('h1').text.strip()\n",
" books['upc'] = soup.find('th', string='UPC').find_next('td').text.strip()\n",
" # Rating (text to number)\n",
" star_tag = soup.find('p', class_='star-rating')\n",
" rating_txt = None\n",
" if star_tag:\n",
" for class_name in star_tag['class']:\n",
" if class_name != 'star-rating':\n",
" rating_txt = class_name\n",
" rating_num = rating_map.get(rating_txt, None)\n",
" books['rating'] = rating_num\n",
" # Price (parse to float, remove pound sign)\n",
" price_str = soup.find(class_=\"price_color\").text.replace(\"Â\", \"\").strip()\n",
" price_val = float(price_str.lstrip('£'))\n",
" books['price'] = price_val\n",
" books['availability'] = soup.find(class_='instock availability').text.strip()\n",
" # Genre\n",
" breadcrumb_links = soup.select('ul.breadcrumb li a')\n",
" genre = breadcrumb_links[-1].text.strip() if len(breadcrumb_links) >= 3 else None\n",
" books['genre'] = genre\n",
" # Description (only up to first dot)\n",
" desc_heading = soup.find('div', id='product_description')\n",
" if desc_heading:\n",
" desc_tag = desc_heading.find_next_sibling('p')\n",
" if desc_tag:\n",
" full_text = desc_tag.text.strip()\n",
" first_sentence = full_text.split('.', 1)[0].strip() + '.' if '.' in full_text else full_text\n",
" else:\n",
" first_sentence = None\n",
" else:\n",
" first_sentence = None\n",
" books['description'] = first_sentence\n",
" \n",
"\n",
" # Apply filters\n",
" min_rating = 4 or 5\n",
" max_price = 20\n",
" if rating_num is not None and rating_num >= min_rating and price_val <= max_price:\n",
" books_dc.append(books)\n",
" \n",
" # Optional: show progress\n",
" if (i+1) % 50 == 0:\n",
" print(f\"Scraped {i+1}/{len(book_urls)} books...\", end='\\r')\n",
" time.sleep(0.05)\n",
" except Exception as e:\n",
" print(f'Failed for {url}: {e}')\n",
"\n",
"print(f\"Total books scraped: {len(books_dc)}\")\n",
"print(\"First 5 books:\", books_dc[:5])\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"source": [
"# Your solution goes here"
"import pandas as pd\n",
"\n",
"\n",
"df = pd.DataFrame(books_dc)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c7f54c1e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>upc</th>\n",
" <th>rating</th>\n",
" <th>price</th>\n",
" <th>availability</th>\n",
" <th>genre</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Set Me Free</td>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>5</td>\n",
" <td>17.46</td>\n",
" <td>In stock (19 available)</td>\n",
" <td>Young Adult</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>The Four Agreements: A Practical Guide to Pers...</td>\n",
" <td>6258a1f6a6dcfe50</td>\n",
" <td>5</td>\n",
" <td>17.66</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>Spirituality</td>\n",
" <td>In The Four Agreements, don Miguel Ruiz reveal...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Sophie's World</td>\n",
" <td>6be3beb0793a53e7</td>\n",
" <td>5</td>\n",
" <td>15.94</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>Philosophy</td>\n",
" <td>A page-turning novel that is also an explorati...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Untitled Collection: Sabbath Poems 2014</td>\n",
" <td>657fe5ead67a7767</td>\n",
" <td>4</td>\n",
" <td>14.27</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>Poetry</td>\n",
" <td>More than thirty-five years ago, when the weat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>This One Summer</td>\n",
" <td>51653ef291ab7ddc</td>\n",
" <td>4</td>\n",
" <td>19.49</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>Sequential Art</td>\n",
" <td>Every summer, Rose goes with her mom and dad t...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>The Zombie Room</td>\n",
" <td>9c96cd1329fbd82d</td>\n",
" <td>5</td>\n",
" <td>19.69</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Default</td>\n",
" <td>An unlikely bond is forged between three men f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>The Silent Wife</td>\n",
" <td>b78deb463531d078</td>\n",
" <td>5</td>\n",
" <td>12.34</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Fiction</td>\n",
" <td>A chilling psychological thriller about a marr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>The Girl You Lost</td>\n",
" <td>4280ac3eab57aa5d</td>\n",
" <td>5</td>\n",
" <td>12.29</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Mystery</td>\n",
" <td>Eighteen years ago your baby daughter was snat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>The Edge of Reason (Bridget Jones #2)</td>\n",
" <td>29fc016c459aeb14</td>\n",
" <td>4</td>\n",
" <td>19.18</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Womens Fiction</td>\n",
" <td>Monday 27 January“7:15 a.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>A Spy's Devotion (The Regency Spies of London #1)</td>\n",
" <td>19fec36a1dfb4c16</td>\n",
" <td>5</td>\n",
" <td>16.97</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Historical Fiction</td>\n",
" <td>In England’s Regency era, manners and elegance...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>75 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" title upc \\\n",
"0 Set Me Free ce6396b0f23f6ecc \n",
"1 The Four Agreements: A Practical Guide to Pers... 6258a1f6a6dcfe50 \n",
"2 Sophie's World 6be3beb0793a53e7 \n",
"3 Untitled Collection: Sabbath Poems 2014 657fe5ead67a7767 \n",
"4 This One Summer 51653ef291ab7ddc \n",
".. ... ... \n",
"70 The Zombie Room 9c96cd1329fbd82d \n",
"71 The Silent Wife b78deb463531d078 \n",
"72 The Girl You Lost 4280ac3eab57aa5d \n",
"73 The Edge of Reason (Bridget Jones #2) 29fc016c459aeb14 \n",
"74 A Spy's Devotion (The Regency Spies of London #1) 19fec36a1dfb4c16 \n",
"\n",
" rating price availability genre \\\n",
"0 5 17.46 In stock (19 available) Young Adult \n",
"1 5 17.66 In stock (18 available) Spirituality \n",
"2 5 15.94 In stock (18 available) Philosophy \n",
"3 4 14.27 In stock (16 available) Poetry \n",
"4 4 19.49 In stock (16 available) Sequential Art \n",
".. ... ... ... ... \n",
"70 5 19.69 In stock (1 available) Default \n",
"71 5 12.34 In stock (1 available) Fiction \n",
"72 5 12.29 In stock (1 available) Mystery \n",
"73 4 19.18 In stock (1 available) Womens Fiction \n",
"74 5 16.97 In stock (1 available) Historical Fiction \n",
"\n",
" description \n",
"0 Aaron Ledbetter’s future had been planned out ... \n",
"1 In The Four Agreements, don Miguel Ruiz reveal... \n",
"2 A page-turning novel that is also an explorati... \n",
"3 More than thirty-five years ago, when the weat... \n",
"4 Every summer, Rose goes with her mom and dad t... \n",
".. ... \n",
"70 An unlikely bond is forged between three men f... \n",
"71 A chilling psychological thriller about a marr... \n",
"72 Eighteen years ago your baby daughter was snat... \n",
"73 Monday 27 January“7:15 a. \n",
"74 In England’s Regency era, manners and elegance... \n",
"\n",
"[75 rows x 7 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1ad0f0e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +471,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down