diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fe55e38
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+2.6_web_scraping-3.ipynb
diff --git a/__pycache__/scraping_utils.cpython-313.pyc b/__pycache__/scraping_utils.cpython-313.pyc
new file mode 100644
index 0000000..cf8c096
Binary files /dev/null and b/__pycache__/scraping_utils.cpython-313.pyc differ
diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
index e552783..f1e08dc 100644
--- a/lab-web-scraping.ipynb
+++ b/lab-web-scraping.ipynb
@@ -110,14 +110,162 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " genre | \n",
+ " UPC | \n",
+ " Price | \n",
+ " Availability | \n",
+ " rating | \n",
+ " description | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Sharp Objects | \n",
+ " Mystery | \n",
+ " e00eb4fd7b871a48 | \n",
+ " 47.82 | \n",
+ " 20 | \n",
+ " 4 | \n",
+ " WICKED above her hipbone, GIRL across her hear... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " The Dirty Little Secrets of Getting Your Dream... | \n",
+ " Business | \n",
+ " 2597b5a345f45e1b | \n",
+ " 33.34 | \n",
+ " 19 | \n",
+ " 4 | \n",
+ " Drawing on his extensive experience evaluating... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " The Coming Woman: A Novel Based on the Life of... | \n",
+ " Default | \n",
+ " e72a5dfc7e9267b2 | \n",
+ " 17.93 | \n",
+ " 19 | \n",
+ " 3 | \n",
+ " \"If you have a heart, if you have a soul, Kare... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " The Boys in the Boat: Nine Americans and Their... | \n",
+ " Default | \n",
+ " e10e1e165dc8be4a | \n",
+ " 22.60 | \n",
+ " 19 | \n",
+ " 4 | \n",
+ " For readers of Laura Hillenbrand's Seabiscuit ... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Shakespeare's Sonnets | \n",
+ " Poetry | \n",
+ " 30a7f60cd76ca58c | \n",
+ " 20.66 | \n",
+ " 19 | \n",
+ " 4 | \n",
+ " This book is an important and complete collect... | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Set Me Free | \n",
+ " Young Adult | \n",
+ " ce6396b0f23f6ecc | \n",
+ " 17.46 | \n",
+ " 19 | \n",
+ " 5 | \n",
+ " Aaron Ledbetter’s future had been planned out ... | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Rip it Up and Start Again | \n",
+ " Music | \n",
+ " a34ba96d4081e6a4 | \n",
+ " 35.02 | \n",
+ " 19 | \n",
+ " 5 | \n",
+ " Punk's raw power rejuvenated rock, but by the ... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " title genre \\\n",
+ "0 Sharp Objects Mystery \n",
+ "1 The Dirty Little Secrets of Getting Your Dream... Business \n",
+ "2 The Coming Woman: A Novel Based on the Life of... Default \n",
+ "3 The Boys in the Boat: Nine Americans and Their... Default \n",
+ "4 Shakespeare's Sonnets Poetry \n",
+ "5 Set Me Free Young Adult \n",
+ "6 Rip it Up and Start Again Music \n",
+ "\n",
+ " UPC Price Availability rating \\\n",
+ "0 e00eb4fd7b871a48 47.82 20 4 \n",
+ "1 2597b5a345f45e1b 33.34 19 4 \n",
+ "2 e72a5dfc7e9267b2 17.93 19 3 \n",
+ "3 e10e1e165dc8be4a 22.60 19 4 \n",
+ "4 30a7f60cd76ca58c 20.66 19 4 \n",
+ "5 ce6396b0f23f6ecc 17.46 19 5 \n",
+ "6 a34ba96d4081e6a4 35.02 19 5 \n",
+ "\n",
+ " description \n",
+ "0 WICKED above her hipbone, GIRL across her hear... \n",
+ "1 Drawing on his extensive experience evaluating... \n",
+ "2 \"If you have a heart, if you have a soul, Kare... \n",
+ "3 For readers of Laura Hillenbrand's Seabiscuit ... \n",
+ "4 This book is an important and complete collect... \n",
+ "5 Aaron Ledbetter’s future had been planned out ... \n",
+ "6 Punk's raw power rejuvenated rock, but by the ... "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your solution goes here"
+ "import scraping_utils as sr\n",
+ "\n",
+ "sr.scrape_books(min_rating=3, max_price=50)"
]
}
],
@@ -126,7 +274,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -140,7 +288,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.13.5"
}
},
"nbformat": 4,
diff --git a/scraping_utils.py b/scraping_utils.py
new file mode 100644
index 0000000..2992df8
--- /dev/null
+++ b/scraping_utils.py
@@ -0,0 +1,101 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+import requests
+
# Root of the catalogue; listing pages ("page-<n>.html") and the per-book
# links taken from those listings are appended to this prefix.
base_url = "https://books.toscrape.com/catalogue/"
# How many listing pages scrape_books() walks (page-1 .. page-<nb_pages>).
nb_pages = 50
+
+
def get_book_url(product):
    """Return the ``href`` attribute of the first ``<a>`` inside *product*.

    The value is returned exactly as stored on the element; it is not
    resolved against any base URL here.
    """
    anchor = product.find('a')
    return anchor.get('href')
+
def get_book_title(book_soup):
    """Return the text of the first ``<h1>`` element found in *book_soup*."""
    heading = book_soup.find('h1')
    return heading.text
+
def _parse_price(raw):
    """Convert a price cell such as ``'£51.77'`` to a float.

    Everything before the first ASCII digit is skipped, so the result does
    not depend on how many characters the currency prefix occupies (a
    mis-decoded pound sign shows up as two characters, for example).

    Raises:
        ValueError: if *raw* contains no digit or the remainder is not a
            valid number.
    """
    for position, char in enumerate(raw):
        if char in '0123456789':
            return float(raw[position:])
    raise ValueError(f"no numeric price found in {raw!r}")


def get_book_information(book_soup):
    """Read UPC, price and availability from the first table of a book page.

    The first ``<table>`` in *book_soup* is scanned; for every ``<th>`` whose
    text is one of the known labels, the text of its next sibling is stored.

    Returns:
        dict: may contain ``'upc'`` (str), ``'price'`` (float, taken from
        the "Price (incl. tax)" row) and ``'availability'`` (str, raw cell
        text). A key is absent when its row is not present in the table.

    Raises:
        AttributeError: if *book_soup* contains no ``<table>``.
        ValueError: if the price cell cannot be parsed as a number.
    """
    # header text -> (output key, converter applied to the value cell text)
    wanted = {
        'UPC': ('upc', None),
        'Price (incl. tax)': ('price', _parse_price),
        'Availability': ('availability', None),
    }

    product_information = {}
    information_table = book_soup.find('table')
    for header in information_table.find_all('th'):
        target = wanted.get(header.text)
        if target is None:
            continue
        key, convert = target
        value = header.find_next_sibling().text
        product_information[key] = convert(value) if convert else value
    return product_information
+
def get_book_genre(book_soup):
    """Return the text of the link in the third ``<li>`` of the page's top list.

    The element is located with a fixed CSS selector; presumably this is the
    category entry of the breadcrumb (confirm against the page markup).
    """
    category_link = book_soup.select_one(
        'body > div > div > ul > li:nth-child(3) > a'
    )
    return category_link.text
+
def get_book_description(book_soup):
    """Return the description paragraph of a book page.

    The paragraph is the ``<p>`` that is a direct child of the ``<article>``
    directly under ``#content_inner``.

    Returns:
        str: the paragraph text, or the literal ``'no description'`` when
        the paragraph (or the soup itself) is missing.
    """
    try:
        return book_soup.select_one('#content_inner > article > p').text
    except AttributeError:
        # select_one() gave None (or book_soup was None): nothing to read.
        # Only this case is handled so Ctrl-C and real bugs still propagate.
        return 'no description'
+
def get_book_rating(book_element):
    """Return the star rating (1-5) encoded in the first ``<p>`` of an element.

    The rating is stored as a CSS class word ('One' .. 'Five') on the first
    ``<p>`` inside *book_element*. Every class on that element is checked, so
    the position of the rating word in the class list does not matter.

    Returns:
        int | float: the rating as an int, or ``float('nan')`` when the
        paragraph, its class attribute or a known rating word is missing.
        NaN compares False against any threshold, so callers filtering with
        ``min_rating <= rating`` drop such entries.
    """
    rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }
    missing = float('nan')

    try:
        classes = book_element.select_one('p').get('class')
    except AttributeError:
        # No <p> found, or book_element is not a searchable element.
        return missing

    if isinstance(classes, str):
        # Some parsers hand back the raw attribute instead of a list.
        classes = classes.split()

    for css_class in classes or ():
        if css_class in rating_map:
            return rating_map[css_class]
    return missing
+
def extract_books(books, base_url, min_rating, max_price):
    """Collect details for the listed books that pass the rating/price filter.

    Args:
        books: iterable of listing elements, one per book (as found on a
            catalogue page).
        base_url: prefix prepended to each book's relative link.
        min_rating: lowest rating (inclusive) a book must have.
        max_price: highest price (inclusive) a book may have.

    Returns:
        dict: ``{0: {...}, 1: {...}, ...}`` keyed by a running index, each
        value holding ``title``, ``genre``, ``UPC``, ``Price``,
        ``Availability``, ``rating`` and ``description``.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response for a product page.
    """
    books_dict = {}
    for book in books:
        # The rating is read from the listing element itself, so a book that
        # fails the rating filter is rejected without downloading its page.
        # A NaN rating makes the comparison False and is rejected as well.
        rating = get_book_rating(book)
        if not min_rating <= rating:
            continue

        # Bounded wait so one stalled connection cannot hang the whole run;
        # error pages are reported instead of being parsed as book pages.
        response = requests.get(base_url + get_book_url(book), timeout=30)
        response.raise_for_status()
        product_page = BeautifulSoup(response.content, "html.parser")

        info = get_book_information(product_page)
        if not max_price >= info['price']:
            continue

        # len() of the result doubles as the next consecutive index.
        books_dict[len(books_dict)] = {
            "title": get_book_title(product_page),
            "genre": get_book_genre(product_page),
            "UPC": info['upc'],
            "Price": info['price'],
            "Availability": info['availability'],
            "rating": rating,
            "description": get_book_description(product_page),
        }
    return books_dict
+
def clean_data(df):
    """Turn the ``Availability`` column into a numeric stock count, in place.

    Cells are expected to look like ``'In stock (20 available)'``; the digits
    right after the opening parenthesis become the value. Cells without such
    a number become NaN.

    The frame is modified in place and also returned. A frame without an
    ``Availability`` column (e.g. an empty scrape result) is returned
    untouched, and a column that is already numeric is left as numbers, so
    calling the function twice is harmless.
    """
    if 'Availability' not in df.columns:
        return df

    availability = df['Availability']
    if not pd.api.types.is_numeric_dtype(availability):
        # expand=False keeps the single capture group as a Series.
        availability = availability.str.extract(r'\((\d+)', expand=False)
    df['Availability'] = pd.to_numeric(availability, errors='coerce')
    return df
+
def scrape_books(min_rating, max_price):
    """Scrape the catalogue and return the books passing both filters.

    Walks listing pages ``page-1.html`` .. ``page-<nb_pages>.html`` under
    ``base_url``, hands the ``product_pod`` articles of each page to
    ``extract_books`` and gathers the results into one DataFrame.

    Args:
        min_rating: lowest rating (inclusive) a book must have.
        max_price: highest price (inclusive) a book may have.

    Returns:
        pandas.DataFrame: one row per matching book, re-indexed from 0, with
        ``Availability`` converted by ``clean_data``. When no book matches,
        an empty frame is returned.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response for a listing page.
    """
    rows = []

    for page_number in range(nb_pages):
        # Progress indicator: zero-based index of the page being processed.
        print(page_number)

        # Listing pages are numbered from 1.
        catalog_url = f"page-{page_number + 1}.html"
        # Bounded wait, and fail loudly on an error page rather than parsing it.
        response = requests.get(base_url + catalog_url, timeout=30)
        response.raise_for_status()

        catalog_page = BeautifulSoup(response.content, "html.parser")
        books = catalog_page.find_all('article', class_='product_pod')

        # Keep plain row dicts and build the frame once at the end; this
        # avoids concatenating many small (often empty) per-page frames.
        page_books = extract_books(books, base_url, min_rating, max_price)
        rows.extend(page_books.values())

    books_df = pd.DataFrame(rows)
    # With zero matches the frame has no columns at all, so there is nothing
    # for clean_data to convert.
    if 'Availability' in books_df.columns:
        books_df = clean_data(books_df)

    return books_df
\ No newline at end of file