Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 173 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,183 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"source": [
"# Your solution goes here"
"import pandas as pd\n",
"import requests\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "593df2f0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Response [200]>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch the first catalogue page of books.toscrape.com (a site built\n",
"# for scraping practice). Leaving `response` as the last expression\n",
"# displays the HTTP status as the cell output (expect <Response [200]>).\n",
"url = \"https://books.toscrape.com/catalogue/page-1.html\"\n",
"response = requests.get(url)\n",
"response"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5b98f789",
"metadata": {},
"outputs": [],
"source": [
"# Parse with an explicit parser so results are consistent across\n",
"# environments and no GuessedAtParserWarning is raised; this matches\n",
"# the \"html.parser\" used by the scraping functions below.\n",
"soup = BeautifulSoup(response.content, \"html.parser\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "636400c6",
"metadata": {},
"outputs": [],
"source": [
"# Locate the <ol class=\"row\"> grid that holds the product cards.\n",
"books_grid = soup.find(\"ol\", class_=\"row\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f022b55f",
"metadata": {},
"outputs": [],
"source": [
"# Each book card is an <li> carrying this exact Bootstrap column class\n",
"# string; matching on the full multi-class value keeps the lookup exact.\n",
"books = books_grid.find_all(\"li\", attrs = {\"class\":\"col-xs-6 col-sm-4 col-md-3 col-lg-3\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b033b71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" UPC Title Genre Availability Description Price (£) Rating\n",
"0 N/A 404 Not Found Unknown N/A No description 17.46 5\n",
"1 N/A 404 Not Found Unknown N/A No description 17.66 5\n",
"2 N/A 404 Not Found Unknown N/A No description 15.94 5\n",
"3 N/A 404 Not Found Unknown N/A No description 14.27 4\n",
"4 N/A 404 Not Found Unknown N/A No description 19.49 4\n",
"\n",
"Scraping complete. Results saved to 'filtered_books.csv'.\n"
]
}
],
"source": [
"BASE_URL = \"https://books.toscrape.com/\"\n",
"\n",
"# Map star-rating class names (e.g. \"Three\") to numeric values\n",
"RATING_MAP = {\n",
"    \"One\": 1,\n",
"    \"Two\": 2,\n",
"    \"Three\": 3,\n",
"    \"Four\": 4,\n",
"    \"Five\": 5\n",
"}\n",
"\n",
"def get_full_url(relative_url):\n",
"    \"\"\"Resolve a book link found on a catalogue page to an absolute URL.\n",
"\n",
"    Detail pages live under /catalogue/, so after stripping any \"../\"\n",
"    segments the path must be re-anchored there.  (The previous version\n",
"    stripped \"catalogue/\" out of the path instead, which produced URLs\n",
"    that 404 -- hence the \"404 Not Found\" titles, \"N/A\" UPCs and\n",
"    \"Unknown\" genres in the earlier output.)\n",
"    \"\"\"\n",
"    path = relative_url.replace(\"../\", \"\")\n",
"    if not path.startswith(\"catalogue/\"):\n",
"        path = \"catalogue/\" + path\n",
"    return BASE_URL + path\n",
"\n",
"def scrape_book_detail(url):\n",
"    \"\"\"Fetch one book's detail page and return its metadata as a dict.\"\"\"\n",
"    response = requests.get(url)\n",
"    soup = BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"    title = soup.h1.text.strip()\n",
"\n",
"    # The description is the <p> sibling following the\n",
"    # #product_description heading; some books have none.\n",
"    description_tag = soup.select_one(\"#product_description ~ p\")\n",
"    description = description_tag.text.strip() if description_tag else \"No description\"\n",
"\n",
"    # Product-information table: row 0 holds the UPC, row 5 availability.\n",
"    table = soup.select(\"table.table.table-striped tr\")\n",
"    upc = table[0].td.text.strip() if len(table) > 0 else \"N/A\"\n",
"    availability = table[5].td.text.strip() if len(table) > 5 else \"N/A\"\n",
"\n",
"    # Breadcrumb reads Home > Books > <Genre> > <Title>.\n",
"    breadcrumb = soup.select(\"ul.breadcrumb li a\")\n",
"    genre = breadcrumb[2].text.strip() if len(breadcrumb) > 2 else \"Unknown\"\n",
"\n",
"    return {\n",
"        \"UPC\": upc,\n",
"        \"Title\": title,\n",
"        \"Genre\": genre,\n",
"        \"Availability\": availability,\n",
"        \"Description\": description\n",
"    }\n",
"\n",
"def scrape_books(min_rating=4, max_price=20):\n",
"    \"\"\"Scrape every catalogue page and return a DataFrame of the books\n",
"    rated at least `min_rating` stars and priced at most `max_price` GBP.\"\"\"\n",
"    all_books = []\n",
"    page_url = \"catalogue/page-1.html\"\n",
"\n",
"    while page_url:\n",
"        response = requests.get(BASE_URL + page_url)\n",
"        soup = BeautifulSoup(response.content, \"html.parser\")\n",
"        book_items = soup.select(\"article.product_pod\")\n",
"\n",
"        for book in book_items:\n",
"            # The rating is encoded as the second class of the star <p>,\n",
"            # e.g. <p class=\"star-rating Three\">.\n",
"            rating_class = book.p['class'][1]\n",
"            rating = RATING_MAP.get(rating_class, 0)\n",
"\n",
"            # Price text looks like \"£51.77\".\n",
"            price_text = book.select_one(\"p.price_color\").text.strip().replace(\"£\", \"\")\n",
"            price = float(price_text)\n",
"\n",
"            if rating >= min_rating and price <= max_price:\n",
"                # Only fetch the (slow) detail page for books that\n",
"                # already pass the cheap rating/price filter.\n",
"                detail_url = get_full_url(book.h3.a['href'])\n",
"                book_data = scrape_book_detail(detail_url)\n",
"\n",
"                # Attach the listing-page fields to the detail data.\n",
"                book_data.update({\n",
"                    \"Price (£)\": price,\n",
"                    \"Rating\": rating\n",
"                })\n",
"\n",
"                all_books.append(book_data)\n",
"\n",
"        # Follow pagination: hrefs inside \"li.next > a\" are relative to\n",
"        # the catalogue/ directory.\n",
"        next_button = soup.select_one(\"li.next > a\")\n",
"        page_url = \"catalogue/\" + next_button['href'] if next_button else None\n",
"\n",
"    return pd.DataFrame(all_books)\n",
"\n",
"if __name__ == \"__main__\":\n",
"    # Example: books with rating >= 4 stars and price <= £20.\n",
"    df = scrape_books(min_rating=4, max_price=20)\n",
"    print(df.head())\n",
"    df.to_csv(\"filtered_books.csv\", index=False)\n",
"    print(\"\\nScraping complete. Results saved to 'filtered_books.csv'.\")\n"
]
}
],
Expand All @@ -126,7 +295,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +309,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.2"
}
},
"nbformat": 4,
Expand Down