Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2.6_web_scraping-3.ipynb
Binary file added __pycache__/scraping_utils.cpython-313.pyc
Binary file not shown.
158 changes: 153 additions & 5 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,162 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>genre</th>\n",
" <th>UPC</th>\n",
" <th>Price</th>\n",
" <th>Availability</th>\n",
" <th>rating</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Sharp Objects</td>\n",
" <td>Mystery</td>\n",
" <td>e00eb4fd7b871a48</td>\n",
" <td>47.82</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>WICKED above her hipbone, GIRL across her hear...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>The Dirty Little Secrets of Getting Your Dream...</td>\n",
" <td>Business</td>\n",
" <td>2597b5a345f45e1b</td>\n",
" <td>33.34</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>Drawing on his extensive experience evaluating...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Coming Woman: A Novel Based on the Life of...</td>\n",
" <td>Default</td>\n",
" <td>e72a5dfc7e9267b2</td>\n",
" <td>17.93</td>\n",
" <td>19</td>\n",
" <td>3</td>\n",
" <td>\"If you have a heart, if you have a soul, Kare...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>The Boys in the Boat: Nine Americans and Their...</td>\n",
" <td>Default</td>\n",
" <td>e10e1e165dc8be4a</td>\n",
" <td>22.60</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>For readers of Laura Hillenbrand's Seabiscuit ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Shakespeare's Sonnets</td>\n",
" <td>Poetry</td>\n",
" <td>30a7f60cd76ca58c</td>\n",
" <td>20.66</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>This book is an important and complete collect...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Set Me Free</td>\n",
" <td>Young Adult</td>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>17.46</td>\n",
" <td>19</td>\n",
" <td>5</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Rip it Up and Start Again</td>\n",
" <td>Music</td>\n",
" <td>a34ba96d4081e6a4</td>\n",
" <td>35.02</td>\n",
" <td>19</td>\n",
" <td>5</td>\n",
" <td>Punk's raw power rejuvenated rock, but by the ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title genre \\\n",
"0 Sharp Objects Mystery \n",
"1 The Dirty Little Secrets of Getting Your Dream... Business \n",
"2 The Coming Woman: A Novel Based on the Life of... Default \n",
"3 The Boys in the Boat: Nine Americans and Their... Default \n",
"4 Shakespeare's Sonnets Poetry \n",
"5 Set Me Free Young Adult \n",
"6 Rip it Up and Start Again Music \n",
"\n",
" UPC Price Availability rating \\\n",
"0 e00eb4fd7b871a48 47.82 20 4 \n",
"1 2597b5a345f45e1b 33.34 19 4 \n",
"2 e72a5dfc7e9267b2 17.93 19 3 \n",
"3 e10e1e165dc8be4a 22.60 19 4 \n",
"4 30a7f60cd76ca58c 20.66 19 4 \n",
"5 ce6396b0f23f6ecc 17.46 19 5 \n",
"6 a34ba96d4081e6a4 35.02 19 5 \n",
"\n",
" description \n",
"0 WICKED above her hipbone, GIRL across her hear... \n",
"1 Drawing on his extensive experience evaluating... \n",
"2 \"If you have a heart, if you have a soul, Kare... \n",
"3 For readers of Laura Hillenbrand's Seabiscuit ... \n",
"4 This book is an important and complete collect... \n",
"5 Aaron Ledbetter’s future had been planned out ... \n",
"6 Punk's raw power rejuvenated rock, but by the ... "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your solution goes here"
"import scraping_utils as sr\n",
"\n",
"sr.scrape_books(min_rating=3, max_price=50)"
]
}
],
Expand All @@ -126,7 +274,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +288,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down
101 changes: 101 additions & 0 deletions scraping_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

nb_pages = 50
base_url = "https://books.toscrape.com/catalogue/"


def get_book_url(product):
book_url = product.find('a').get('href')
return book_url

def get_book_title(book_soup):
return book_soup.find('h1').text

def get_book_information(book_soup):
product_information = {}
information_table = book_soup.find('table')
details = information_table.find_all('th')
for detail in details:
if detail.text == 'UPC':
product_information['upc'] = detail.find_next_sibling().text
if detail.text == 'Price (incl. tax)':
product_information['price'] = float(detail.find_next_sibling().text[1:])
if detail.text == 'Availability':
product_information['availability'] = detail.find_next_sibling().text
return product_information

def get_book_genre(book_soup):
return book_soup.select_one('body > div > div > ul > li:nth-child(3) > a').text

def get_book_description(book_soup):
try:
return book_soup.select_one('#content_inner > article > p').text
except:
return 'no description'

def get_book_rating(book_element):
rating_map = {'One' : 1,
'Two' : 2,
'Three' : 3,
'Four' : 4,
'Five' : 5}
try:
return rating_map[book_element.select_one('p').get('class')[1]]
except:
return pd.nan

def extract_books(books, base_url, min_rating, max_price):
books_dict = {}
index = 0
for book in books:
book_url = get_book_url(book)
response = requests.get(base_url + book_url)
product_page = BeautifulSoup(response.content, "html.parser")
info = get_book_information(product_page)
rating = get_book_rating(book)
if max_price >= info['price'] and min_rating <= rating:
books_dict[index] = {
"title": get_book_title(product_page),
"genre": get_book_genre(product_page),
"UPC": info['upc'],
"Price": info['price'],
"Availability": info['availability'],
"rating": rating,
"description": get_book_description(product_page)
}
index += 1
return books_dict

def clean_data(df):
# Extract the number inside parentheses using regex and convert to int
df['Availability'] = df['Availability'].str.extract(r'\((\d+)')
df['Availability'] = pd.to_numeric(df['Availability'], errors='coerce')
return df

def scrape_books(min_rating, max_price):
books_dataframes_list = []

for page_number in range(nb_pages):
# Print current page number being processed
print(page_number)
# Construct URL for current catalog page
catalog_url = f"page-{page_number + 1}.html"
# Fetch the catalog page
response = requests.get(base_url + catalog_url)
# Parse HTML content using BeautifulSoup
catalog_page = BeautifulSoup(response.content, "html.parser")
# Find all book articles on the page
books = catalog_page.find_all('article', class_='product_pod')
# Extract book information from products using helper function
books_dict = extract_books(books, base_url, min_rating, max_price)
# Convert dictionary of books to pandas DataFrame
df = pd.DataFrame.from_dict(books_dict, orient='index')
# Append DataFrame to list of all books
books_dataframes_list.append(df)

books_df = pd.concat(books_dataframes_list, ignore_index=True)
books_df = clean_data(books_df)

return books_df