Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 173 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,183 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 302,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"source": [
"# Your solution goes here"
"# Import necesary libraries\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 310,
"id": "606d2f30",
"metadata": {},
"outputs": [],
"source": [
"# Define the function scrape books with the arguments min_rating and max_price\n",
"def scrape_books(min_rating, max_price):\n",
" \n",
" # Define functions to get title, url and price\n",
" def get_title(book):\n",
" title = book.find_all(\"a\")[1][\"title\"]\n",
" return title\n",
" def get_url(book):\n",
" book_url = book.find_all(\"a\")[1][\"href\"]\n",
" domain = \"https://books.toscrape.com/catalogue/\"\n",
" return domain+book_url\n",
" def get_price(book):\n",
" price = book.find(\"p\", attrs = {\"class\":\"price_color\"}).get_text().replace('£','')\n",
" return price\n",
" def get_availability(book):\n",
" availability = book.find(\"p\", attrs = {\"class\":\"instock availability\"}).get_text().replace('\\n', '')\n",
" return availability\n",
" \n",
" # Define functions to get upc, genre, rating and description\n",
" # Atention! These are to be performed in the book's soup, not the general one!\n",
" def get_upc(book):\n",
" upc = book.find(\"td\").get_text()\n",
" return upc\n",
" def get_genre(book):\n",
" genre = book.find(\"ul\", attrs={'class':'breadcrumb'}).find_all('li')[2].get_text().strip()\n",
" return genre\n",
" def get_rating(book):\n",
" rating = book.find_all('p')[2].get('class')[1].lower()\n",
" my_nums = {'one': 1,\n",
" 'two': 2,\n",
" 'three': 3,\n",
" 'four': 4,\n",
" 'five': 5,\n",
" }\n",
" return my_nums[rating]\n",
" def get_description(book):\n",
" description = book.find_all('p')[3].get_text()\n",
" return description\n",
" \n",
" # Create an empty dictionary for books and initialise key\n",
" books_dict = {}\n",
" key = 0\n",
" \n",
" # Get grids for all 50 pages\n",
" for number in range(1,51):\n",
" # Create a request to access the HTML code of each page\n",
" url = f\"https://books.toscrape.com/catalogue/page-{str(number)}.html\"\n",
" response = requests.get(url)\n",
" # Create a soup object from the HTML code \n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" \n",
" # Find all the books in a given page\n",
" book_grid = soup.find(\"ol\", attrs = {\"class\":\"row\"})\n",
" # Inside of the grid identify all books\n",
" books = book_grid.find_all(\"li\", attrs = {\"class\":\"col-xs-6 col-sm-4 col-md-3 col-lg-3\"})\n",
" \n",
" # Iterate functions to all books \n",
" for book in books:\n",
" title = get_title(book)\n",
" price = get_price(book)\n",
" availability = get_availability(book)\n",
" book_url = get_url(book)\n",
" \n",
" ## Access the book url to retrieve more data.\n",
" book_response = requests.get(book_url)\n",
" ## 1st generate a soup for the book\n",
" book_soup = BeautifulSoup(book_response.content, 'html.parser')\n",
" \n",
" ## Start gathering info from the book's page\n",
" upc = get_upc(book_soup)\n",
" genre = get_genre(book_soup)\n",
" description = get_description(book_soup)\n",
" rating = get_rating(book_soup)\n",
" \n",
" #Finally, save the retrieved data in a dictionary\n",
" books_dict[key] = {\"title\": title,\n",
" \"genre\": genre,\n",
" \"price_£\": price,\n",
" \"availability\": availability,\n",
" \"rating\": rating,\n",
" \"upc\": upc,\n",
" \"description\": description,\n",
" }\n",
" key += 1\n",
" # Turn the dictionary into a dataframe \n",
" books_df = pd.DataFrame.from_dict(books_dict, orient = \"index\")\n",
" books_df['price_£'] = books_df['price_£'].astype('float32')\n",
" condition = (books_df['rating'] >= min_rating) & (books_df['price_£'] <= max_price)\n",
" return books_df[condition]"
]
},
{
"cell_type": "code",
"execution_count": 309,
"id": "663372a7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>genre</th>\n",
" <th>price_£</th>\n",
" <th>availability</th>\n",
" <th>rating</th>\n",
" <th>upc</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Set Me Free</td>\n",
" <td>Young Adult</td>\n",
" <td>17.459999</td>\n",
" <td>In stock</td>\n",
" <td>5</td>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title genre price_£ availability rating \\\n",
"12 Set Me Free Young Adult 17.459999 In stock 5 \n",
"\n",
" upc description \n",
"12 ce6396b0f23f6ecc Aaron Ledbetter’s future had been planned out ... "
]
},
"execution_count": 309,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scrape_books(4,20)"
]
}
],
Expand All @@ -126,7 +295,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +309,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down