diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fe55e38 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +2.6_web_scraping-3.ipynb diff --git a/__pycache__/scraping_utils.cpython-313.pyc b/__pycache__/scraping_utils.cpython-313.pyc new file mode 100644 index 0000000..cf8c096 Binary files /dev/null and b/__pycache__/scraping_utils.cpython-313.pyc differ diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..f1e08dc 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -110,14 +110,162 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "40359eee-9cd7-4884-bfa4-83344c222305", "metadata": { "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlegenreUPCPriceAvailabilityratingdescription
0Sharp ObjectsMysterye00eb4fd7b871a4847.82204WICKED above her hipbone, GIRL across her hear...
1The Dirty Little Secrets of Getting Your Dream...Business2597b5a345f45e1b33.34194Drawing on his extensive experience evaluating...
2The Coming Woman: A Novel Based on the Life of...Defaulte72a5dfc7e9267b217.93193\"If you have a heart, if you have a soul, Kare...
3The Boys in the Boat: Nine Americans and Their...Defaulte10e1e165dc8be4a22.60194For readers of Laura Hillenbrand's Seabiscuit ...
4Shakespeare's SonnetsPoetry30a7f60cd76ca58c20.66194This book is an important and complete collect...
5Set Me FreeYoung Adultce6396b0f23f6ecc17.46195Aaron Ledbetter’s future had been planned out ...
6Rip it Up and Start AgainMusica34ba96d4081e6a435.02195Punk's raw power rejuvenated rock, but by the ...
\n", + "
" + ], + "text/plain": [ + " title genre \\\n", + "0 Sharp Objects Mystery \n", + "1 The Dirty Little Secrets of Getting Your Dream... Business \n", + "2 The Coming Woman: A Novel Based on the Life of... Default \n", + "3 The Boys in the Boat: Nine Americans and Their... Default \n", + "4 Shakespeare's Sonnets Poetry \n", + "5 Set Me Free Young Adult \n", + "6 Rip it Up and Start Again Music \n", + "\n", + " UPC Price Availability rating \\\n", + "0 e00eb4fd7b871a48 47.82 20 4 \n", + "1 2597b5a345f45e1b 33.34 19 4 \n", + "2 e72a5dfc7e9267b2 17.93 19 3 \n", + "3 e10e1e165dc8be4a 22.60 19 4 \n", + "4 30a7f60cd76ca58c 20.66 19 4 \n", + "5 ce6396b0f23f6ecc 17.46 19 5 \n", + "6 a34ba96d4081e6a4 35.02 19 5 \n", + "\n", + " description \n", + "0 WICKED above her hipbone, GIRL across her hear... \n", + "1 Drawing on his extensive experience evaluating... \n", + "2 \"If you have a heart, if you have a soul, Kare... \n", + "3 For readers of Laura Hillenbrand's Seabiscuit ... \n", + "4 This book is an important and complete collect... \n", + "5 Aaron Ledbetter’s future had been planned out ... \n", + "6 Punk's raw power rejuvenated rock, but by the ... " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your solution goes here" + "import scraping_utils as sr\n", + "\n", + "sr.scrape_books(min_rating=3, max_price=50)" ] } ], @@ -126,7 +274,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +288,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/scraping_utils.py b/scraping_utils.py new file mode 100644 index 0000000..2992df8 --- /dev/null +++ b/scraping_utils.py @@ -0,0 +1,101 @@ +import pandas as pd +from bs4 import BeautifulSoup +import requests + +nb_pages = 50 +base_url = "https://books.toscrape.com/catalogue/" + + +def get_book_url(product): + book_url = product.find('a').get('href') + return book_url + +def get_book_title(book_soup): + return book_soup.find('h1').text + +def get_book_information(book_soup): + product_information = {} + information_table = book_soup.find('table') + details = information_table.find_all('th') + for detail in details: + if detail.text == 'UPC': + product_information['upc'] = detail.find_next_sibling().text + if detail.text == 'Price (incl. tax)': + product_information['price'] = float(detail.find_next_sibling().text[1:]) + if detail.text == 'Availability': + product_information['availability'] = detail.find_next_sibling().text + return product_information + +def get_book_genre(book_soup): + return book_soup.select_one('body > div > div > ul > li:nth-child(3) > a').text + +def get_book_description(book_soup): + try: + return book_soup.select_one('#content_inner > article > p').text + except: + return 'no description' + +def get_book_rating(book_element): + rating_map = {'One' : 1, + 'Two' : 2, + 'Three' : 3, + 'Four' : 4, + 'Five' : 5} + try: + return rating_map[book_element.select_one('p').get('class')[1]] + except: + return pd.nan + +def extract_books(books, base_url, min_rating, max_price): + books_dict = {} + index = 0 + for book in books: + book_url = get_book_url(book) + response = requests.get(base_url + book_url) + product_page = BeautifulSoup(response.content, "html.parser") + info = get_book_information(product_page) + rating = get_book_rating(book) + if max_price >= info['price'] and min_rating <= rating: + books_dict[index] = { + "title": get_book_title(product_page), + "genre": get_book_genre(product_page), + "UPC": info['upc'], + "Price": info['price'], + "Availability": info['availability'], + "rating": rating, + "description": get_book_description(product_page) + } + index += 1 + return books_dict + +def clean_data(df): + # Extract the number inside parentheses using regex and convert to int + df['Availability'] = df['Availability'].str.extract(r'\((\d+)') + df['Availability'] = pd.to_numeric(df['Availability'], errors='coerce') + return df + +def scrape_books(min_rating, max_price): + books_dataframes_list = [] + + for page_number in range(nb_pages): + # Print current page number being processed + print(page_number) + # Construct URL for current catalog page + catalog_url = f"page-{page_number + 1}.html" + # Fetch the catalog page + response = requests.get(base_url + catalog_url) + # Parse HTML content using BeautifulSoup + catalog_page = BeautifulSoup(response.content, "html.parser") + # Find all book articles on the page + books = catalog_page.find_all('article', class_='product_pod') + # Extract book information from products using helper function + books_dict = extract_books(books, base_url, min_rating, max_price) + # Convert dictionary of books to pandas DataFrame + df = pd.DataFrame.from_dict(books_dict, orient='index') + # Append DataFrame to list of all books + books_dataframes_list.append(df) + + books_df = pd.concat(books_dataframes_list, ignore_index=True) + books_df = clean_data(books_df) + + return books_df \ No newline at end of file