def get_page(url):
    """Download a page from books.toscrape.com and return it parsed.

    Args:
        url: Path relative to the site root, for example the ``href`` of a
            book link taken from the landing page.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Imported locally because the notebook's module-level import cell is
    # commented out; relying on globals makes this fail with NameError on a
    # fresh kernel.
    import requests
    from bs4 import BeautifulSoup

    complete_url = f"http://books.toscrape.com/{url}"
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response_get = requests.get(complete_url, timeout=10)
    # Fail here on an error page instead of parsing it and hitting an
    # unrelated AttributeError later when expected tags are missing.
    response_get.raise_for_status()
    return BeautifulSoup(response_get.content, 'html.parser')
"output_type": "execute_result" + } + ], + "source": [ + "#rating_stars = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n", + "#rating = rating_stars[books[0].find('p', class_='star-rating')['class'][1]]\n", + "\n", + "#rating" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "id": "40359eee-9cd7-4884-bfa4-83344c222305", "metadata": { "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TitlePriceRatingUPCGenreAvailabilityDescription
0Set Me Free17.465ce6396b0f23f6eccYoung AdultIn stock (19 available)Aaron Ledbetter’s future had been planned out ...
\n", + "
def scrape_books(min_rating, max_price):
    """Scrape the books.toscrape.com landing page into a filtered DataFrame.

    Only the books listed on the landing page are considered. For every book
    whose star rating is at least ``min_rating`` and whose price is at most
    ``max_price``, the book's detail page is fetched through ``get_page`` to
    collect the remaining fields.

    Args:
        min_rating: Lowest star rating (1-5) to keep, inclusive.
        max_price: Highest price to keep, inclusive.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Price``,
        ``Rating``, ``UPC``, ``Genre``, ``Availability`` and ``Description``,
        one row per matching book. When nothing matches, the frame has those
        columns and no rows.

    Raises:
        requests.HTTPError: If the landing page answers with a 4xx/5xx status.
        requests.Timeout: If the landing page does not answer in time.
    """
    import re

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    columns = [
        'Title',
        'Price',
        'Rating',
        'UPC',
        'Genre',
        "Availability",
        "Description",
    ]
    # The rating is encoded as a CSS class word ("star-rating Three").
    rating_stars = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

    response = requests.get("http://books.toscrape.com/", timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    rows = []
    for book in soup.find_all('article', class_='product_pod'):
        price_text = book.find('p', class_='price_color').text
        # Keep only digits and the decimal point so the parse does not depend
        # on the currency prefix being exactly one character long.
        price = float(re.sub(r'[^0-9.]', '', price_text))
        rating = rating_stars[book.find('p', class_='star-rating')['class'][1]]

        # Filter before touching the detail page: each detail page costs one
        # extra HTTP request.
        if not (rating >= min_rating and price <= max_price):
            continue

        link = book.find('h3').find('a')
        page_content = get_page(link.get('href'))

        # The description paragraph follows the "product_description" header
        # div. A detail page without that header yields an empty description
        # instead of aborting the whole scrape.
        description_header = page_content.find('div', id="product_description")
        description_tag = (
            description_header.find_next_sibling('p')
            if description_header is not None
            else None
        )
        description = (
            description_tag.get_text(strip=True)
            if description_tag is not None
            else ""
        )

        rows.append({
            'Title': link['title'],
            'Price': price,
            'Rating': rating,
            # The first <td> of the product information table holds the UPC.
            'UPC': page_content.find('table', class_="table table-striped").find("td").text,
            # Third breadcrumb entry is the category (Home > Books > <genre>).
            'Genre': page_content.find('ul', class_="breadcrumb").find_all('li')[2].text.strip(),
            "Availability": page_content.find('p', class_="instock availability").get_text(strip=True),
            "Description": description,
        })

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


scrape_books(4, 20)