# Import necessary libraries
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root of the paginated catalogue; listing pages live at page-<n>.html below it.
_CATALOGUE_URL = "https://books.toscrape.com/catalogue/"

# The rating is encoded as an English number word in the tag's CSS classes.
_RATING_WORDS = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}

# Column order of the returned DataFrame.
_COLUMNS = [
    "title",
    "genre",
    "price_£",
    "availability",
    "rating",
    "upc",
    "description",
]


def _fetch_soup(session, url, timeout):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no answer arrives within *timeout* seconds.
    """
    response = session.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a book page.
    response.raise_for_status()
    # Hand over the raw bytes so BeautifulSoup works out the encoding itself.
    return BeautifulSoup(response.content, "html.parser")


def _parse_listing(book, page_url):
    """Extract title, detail-page URL, price and availability from a grid item."""
    # The second anchor of a grid item carries the full title and the link.
    link = book.find_all("a")[1]
    price_text = book.find("p", attrs={"class": "price_color"}).get_text()
    availability_tag = book.find("p", attrs={"class": "instock availability"})
    return {
        "title": link["title"],
        # Links are relative to the listing page they appear on.
        "url": urljoin(page_url, link["href"]),
        "price": float(price_text.replace("£", "")),
        "availability": availability_tag.get_text(strip=True),
    }


def _parse_detail(book_soup):
    """Extract upc, genre, rating and description from a book's own page.

    Attention: this works on the soup of the book's page, not the listing one.

    Raises:
        KeyError: if the rating class is not one of the known number words.
    """
    # NOTE(review): assumes the first table cell on the page is the UPC,
    # as the original positional lookup did - confirm against the page.
    upc = book_soup.find("td").get_text()

    breadcrumb = book_soup.find("ul", attrs={"class": "breadcrumb"})
    genre = breadcrumb.find_all("li")[2].get_text().strip()

    # Look the rating up by class rather than by paragraph position.
    rating_tag = book_soup.find("p", class_="star-rating")
    rating = _RATING_WORDS[rating_tag["class"][1].lower()]

    # Pages without a description block yield "" instead of raising.
    description = ""
    description_header = book_soup.find(id="product_description")
    if description_header is not None:
        paragraph = description_header.find_next_sibling("p")
        if paragraph is not None:
            description = paragraph.get_text()

    return {
        "upc": upc,
        "genre": genre,
        "rating": rating,
        "description": description,
    }


# Define the function scrape_books with the arguments min_rating and max_price
def scrape_books(min_rating, max_price, *, num_pages=50, timeout=10):
    """Scrape the catalogue and return the books matching both filters.

    Args:
        min_rating: lowest star rating (1-5) a book must have to be kept.
        max_price: highest price (in £) a book may have to be kept.
        num_pages: number of listing pages to walk, starting at page 1.
        timeout: seconds to wait for each HTTP response.

    Returns:
        A pandas DataFrame with the columns title, genre, price_£,
        availability, rating, upc and description. The index is each book's
        position in scrape order (0-based, counted over every book seen,
        including the ones filtered out).

    Raises:
        requests.RequestException: on network failures or HTTP error statuses.
        KeyError: if a book page carries an unknown rating word.
    """
    records = []
    positions = []
    position = 0

    # One session reuses the connection across the many requests made here.
    with requests.Session() as session:
        for page_number in range(1, num_pages + 1):
            page_url = f"{_CATALOGUE_URL}page-{page_number}.html"
            page_soup = _fetch_soup(session, page_url, timeout)

            # Find the grid of the page, then every book inside of it
            book_grid = page_soup.find("ol", attrs={"class": "row"})
            books = book_grid.find_all(
                "li", attrs={"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"}
            )

            for book in books:
                # Every book consumes a position, kept or not, so the index
                # stays the book's place in the catalogue.
                current_position = position
                position += 1

                listing = _parse_listing(book, page_url)
                # The listing already shows the price: skip the extra request
                # for books that can never pass the price filter.
                if listing["price"] > max_price:
                    continue

                detail = _parse_detail(
                    _fetch_soup(session, listing["url"], timeout)
                )
                if detail["rating"] < min_rating:
                    continue

                records.append(
                    {
                        "title": listing["title"],
                        "genre": detail["genre"],
                        "price_£": listing["price"],
                        "availability": listing["availability"],
                        "rating": detail["rating"],
                        "upc": detail["upc"],
                        "description": detail["description"],
                    }
                )
                positions.append(current_position)

    # Explicit columns keep the frame well-formed even when nothing matches.
    return pd.DataFrame(records, index=positions, columns=_COLUMNS)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlegenreprice_£availabilityratingupcdescription
12Set Me FreeYoung Adult17.459999In stock5ce6396b0f23f6eccAaron Ledbetter’s future had been planned out ...
\n", + "
" + ], + "text/plain": [ + " title genre price_£ availability rating \\\n", + "12 Set Me Free Young Adult 17.459999 In stock 5 \n", + "\n", + " upc description \n", + "12 ce6396b0f23f6ecc Aaron Ledbetter’s future had been planned out ... " + ] + }, + "execution_count": 309, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scrape_books(4,20)" ] } ], @@ -126,7 +295,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +309,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.5" } }, "nbformat": 4,