diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
index e552783..f973690 100644
--- a/lab-web-scraping.ipynb
+++ b/lab-web-scraping.ipynb
@@ -111,13 +111,239 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "c2caf064",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#import requests\n",
+ "#from bs4 import BeautifulSoup\n",
+ "#import pandas as pd\n",
+ "\n",
+ "#url = \"http://books.toscrape.com/\"\n",
+ "#response = requests.get(url)\n",
+ "#soup = BeautifulSoup(response.content, 'html.parser')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "998a578e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#books = soup.find_all('article', class_='product_pod')\n",
+ "#books"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "766314e2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'A Light in the Attic'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#title = books[0].find('h3').find('a')['title']\n",
+ "#title"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df312ede",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'51.77'"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#price = books[0].find('p', class_='price_color').text[1:]\n",
+ "#price"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "4e273e76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_page(url):\n",
+ "    \"\"\"Download one page of books.toscrape.com and return it parsed.\n",
+ "\n",
+ "    Args:\n",
+ "        url: Path of the page relative to the site root.\n",
+ "\n",
+ "    Returns:\n",
+ "        BeautifulSoup: The parsed HTML of the requested page.\n",
+ "\n",
+ "    Raises:\n",
+ "        requests.RequestException: If the request fails, times out or\n",
+ "            the server answers with an error status.\n",
+ "    \"\"\"\n",
+ "    # Local imports: this cell must not depend on an earlier import cell\n",
+ "    # having been executed in the current kernel.\n",
+ "    import requests\n",
+ "    from bs4 import BeautifulSoup\n",
+ "\n",
+ "    complete_url = f\"http://books.toscrape.com/{url}\"\n",
+ "    # Without a timeout a stalled connection would block the notebook.\n",
+ "    response_get = requests.get(complete_url, timeout=10)\n",
+ "    # Raise on 4xx/5xx instead of silently parsing an error page.\n",
+ "    response_get.raise_for_status()\n",
+ "    return BeautifulSoup(response_get.content, 'html.parser')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "609eb711",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#rating_stars = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+ "#rating = rating_stars[books[0].find('p', class_='star-rating')['class'][1]]\n",
+ "\n",
+ "#rating"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Title | \n",
+ " Price | \n",
+ " Rating | \n",
+ " UPC | \n",
+ " Genre | \n",
+ " Availability | \n",
+ " Description | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Set Me Free | \n",
+ " 17.46 | \n",
+ " 5 | \n",
+ " ce6396b0f23f6ecc | \n",
+ " Young Adult | \n",
+ " In stock (19 available) | \n",
+ " Aaron Ledbetter’s future had been planned out ... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Title Price Rating UPC Genre \\\n",
+ "0 Set Me Free 17.46 5 ce6396b0f23f6ecc Young Adult \n",
+ "\n",
+ " Availability Description \n",
+ "0 In stock (19 available) Aaron Ledbetter’s future had been planned out ... "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your solution goes here"
+ "# Your solution goes here\n",
+ "def scrape_books(min_rating, max_price):\n",
+ "    \"\"\"Scrape the books.toscrape.com front page for matching books.\n",
+ "\n",
+ "    Only the books listed on the front page are examined. The detail\n",
+ "    page of a book is downloaded (through ``get_page``) only when the\n",
+ "    book passes both filters.\n",
+ "\n",
+ "    Args:\n",
+ "        min_rating: Lowest star rating (1-5) a book may have to be kept.\n",
+ "        max_price: Highest price a book may have to be kept.\n",
+ "\n",
+ "    Returns:\n",
+ "        pandas.DataFrame: One row per matching book with the columns\n",
+ "        Title, Price, Rating, UPC, Genre, Availability and Description.\n",
+ "        Description is None when the detail page has no description\n",
+ "        paragraph.\n",
+ "\n",
+ "    Raises:\n",
+ "        requests.RequestException: If the front page request fails,\n",
+ "            times out or returns an error status.\n",
+ "    \"\"\"\n",
+ "    import requests\n",
+ "    from bs4 import BeautifulSoup\n",
+ "    import pandas as pd\n",
+ "\n",
+ "    columns = ['Title', 'Price', 'Rating', 'UPC', 'Genre', 'Availability', 'Description']\n",
+ "    # The rating is the second CSS class of the 'star-rating' paragraph,\n",
+ "    # spelled out as a word; built once instead of on every iteration.\n",
+ "    rating_stars = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+ "\n",
+ "    response = requests.get(\"http://books.toscrape.com/\", timeout=10)\n",
+ "    response.raise_for_status()\n",
+ "    soup = BeautifulSoup(response.content, 'html.parser')\n",
+ "\n",
+ "    rows = []\n",
+ "    for book in soup.find_all('article', class_='product_pod'):\n",
+ "        # [1:] drops the first character of the price text (the currency sign).\n",
+ "        price = float(book.find('p', class_='price_color').text[1:])\n",
+ "        rating = rating_stars[book.find('p', class_='star-rating')['class'][1]]\n",
+ "\n",
+ "        # Filter on the listing data first so that detail pages are only\n",
+ "        # requested for books that will end up in the result.\n",
+ "        if not (rating >= min_rating and price <= max_price):\n",
+ "            continue\n",
+ "\n",
+ "        link = book.find('h3').find('a')\n",
+ "        page_content = get_page(link.get('href'))\n",
+ "\n",
+ "        # The description is the <p> following the 'product_description' div.\n",
+ "        # find() returns None when that div is absent (presumably for books\n",
+ "        # without a description - verify against the site), so guard it\n",
+ "        # instead of chaining attribute access on None.\n",
+ "        header = page_content.find('div', id=\"product_description\")\n",
+ "        paragraph = header.find_next_sibling('p') if header is not None else None\n",
+ "        description = paragraph.get_text(strip=True) if paragraph is not None else None\n",
+ "\n",
+ "        rows.append({\n",
+ "            'Title': link['title'],\n",
+ "            'Price': price,\n",
+ "            'Rating': rating,\n",
+ "            'UPC': page_content.find('table', class_=\"table table-striped\").find(\"td\").text,\n",
+ "            'Genre': page_content.find('ul', class_=\"breadcrumb\").find_all('li')[2].text.strip(),\n",
+ "            'Availability': page_content.find('p', class_=\"instock availability\").get_text(strip=True),\n",
+ "            'Description': description,\n",
+ "        })\n",
+ "\n",
+ "    # Explicit columns keep the frame's shape even when nothing matched.\n",
+ "    return pd.DataFrame(rows, columns=columns)\n",
+ "\n",
+ "\n",
+ "scrape_books(4, 20)"
]
}
],
@@ -126,7 +352,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "base",
"language": "python",
"name": "python3"
},
@@ -140,7 +366,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.13.5"
}
},
"nbformat": 4,