Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 186 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,196 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [
{
"data": {
"text/plain": [
"<Response [200]>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import necessary libraries\n",
"import pandas as pd\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"url = \"https://books.toscrape.com/\"\n",
"response = requests.get(url)\n",
"response.encoding = 'utf-8' # Force correct encoding\n",
"soup = BeautifulSoup(response.text, \"html.parser\")\n",
"response\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b99003f1",
"metadata": {},
"outputs": [],
"source": [
"# Your solution goes here"
"# Set filters for data extraction\n",
"min_rating = 4\n",
"max_price = 20.0"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f090dad6",
"metadata": {},
"outputs": [],
"source": [
"# Helper functions\n",
"\n",
"# Convert rating in string format to number\n",
"rating_to_int = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
"\n",
"def parse_price(p_text: str) -> float: # Convert price string '£51.77' to float\n",
" # Remove all non-digit and non-dot characters\n",
" clean_text = ''.join(c for c in p_text if c.isdigit() or c == '.')\n",
" return float(clean_text)\n",
"\n",
"def parse_rating(article_tag) -> int: # Extract numeric rating from article tag\n",
" p = article_tag.find('p', class_='star-rating')\n",
" for cls in p.get('class', []):\n",
" if cls in rating_to_int:\n",
" return rating_to_int[cls]\n",
" return 0"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "62da0a04",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\cpall\\AppData\\Local\\Temp\\ipykernel_18124\\3419929751.py:51: DeprecationWarning: The 'text' argument to find()-type methods is deprecated. Use 'string' instead.\n",
" upc_tag = detail_soup.find(\"th\", text=\"UPC\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" UPC Title \\\n",
"0 ce6396b0f23f6ecc Set Me Free \n",
"1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... \n",
"2 6be3beb0793a53e7 Sophie's World \n",
"3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 \n",
"4 51653ef291ab7ddc This One Summer \n",
"\n",
" Price (£) Rating Genre Availability \\\n",
"0 17.46 5 Young Adult In stock (19 available) \n",
"1 17.66 5 Spirituality In stock (18 available) \n",
"2 15.94 5 Philosophy In stock (18 available) \n",
"3 14.27 4 Poetry In stock (16 available) \n",
"4 19.49 4 Sequential Art In stock (16 available) \n",
"\n",
" Description \n",
"0 Aaron Ledbetter’s future had been planned out ... \n",
"1 In The Four Agreements, don Miguel Ruiz reveal... \n",
"2 A page-turning novel that is also an explorati... \n",
"3 More than thirty-five years ago, when the weat... \n",
"4 Every summer, Rose goes with her mom and dad t... \n",
"Total books found: 75\n"
]
}
],
"source": [
"# Scraping function scrapes books to Scrape and return books\n",
"# with rating >= min_rating and price <= max_price, returns a pandas DataFrame.\n",
"\n",
"def scrape_books(min_rating=4, max_price=20.0):\n",
" base_url = \"http://books.toscrape.com/catalogue/\"\n",
" page_num = 1\n",
" books_data = []\n",
"\n",
" while True:\n",
" # Construct page URL\n",
" url = f\"{base_url}page-{page_num}.html\"\n",
" response = requests.get(url)\n",
" \n",
" # Stop if page does not exist\n",
" if response.status_code != 200:\n",
" break\n",
" \n",
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
" books = soup.find_all(\"article\", class_=\"product_pod\")\n",
" \n",
" for book in books:\n",
" # Extract basic info\n",
" title = book.h3.a[\"title\"]\n",
" price = parse_price(book.find(\"p\", class_=\"price_color\").text)\n",
" rating = parse_rating(book)\n",
"\n",
" # Filter by rating and price\n",
" if rating >= min_rating and price <= max_price:\n",
" # Go to detail page for extra info\n",
" detail_link = book.h3.a[\"href\"]\n",
" # Adjust relative URL\n",
" detail_url = base_url + detail_link.replace('../../../', '')\n",
" detail_resp = requests.get(detail_url)\n",
" detail_resp.encoding = 'utf-8'\n",
" detail_soup = BeautifulSoup(detail_resp.text, \"html.parser\")\n",
"\n",
" # Description\n",
" description_tag = detail_soup.find(\"div\", id=\"product_description\")\n",
" description = \"\"\n",
" if description_tag:\n",
" description = description_tag.find_next_sibling(\"p\").text\n",
"\n",
" # Genre from breadcrumb\n",
" breadcrumb = detail_soup.find(\"ul\", class_=\"breadcrumb\")\n",
" genre = breadcrumb.find_all(\"li\")[2].text.strip()\n",
"\n",
" # Availability\n",
" availability = detail_soup.find(\"p\", class_=\"instock availability\").text.strip()\n",
"\n",
" # UPC\n",
" upc_tag = detail_soup.find(\"th\", text=\"UPC\")\n",
" upc = upc_tag.find_next_sibling(\"td\").text.strip() if upc_tag else \"\"\n",
"\n",
" # Store data\n",
" books_data.append({\n",
" \"UPC\": upc,\n",
" \"Title\": title,\n",
" \"Price (£)\": price,\n",
" \"Rating\": rating,\n",
" \"Genre\": genre,\n",
" \"Availability\": availability,\n",
" \"Description\": description\n",
" })\n",
"\n",
" page_num += 1 # Move to next page\n",
"\n",
" # Convert to DataFrame\n",
" df = pd.DataFrame(books_data)\n",
" return df\n",
"\n",
"# Example Usage\n",
"if __name__ == \"__main__\":\n",
" df_books = scrape_books(min_rating=4, max_price=20.0)\n",
"\n",
" # Show first few results\n",
" print(df_books.head()) \n",
" \n",
" # Show total number of books found\n",
" print(f\"Total books found: {len(df_books)}\")"
]
}
],
Expand All @@ -126,7 +308,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +322,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down