# Scrape the books.toscrape.com catalogue listing pages, then build a table
# with the details of every book that passes a rating / price filter.
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

CATALOGUE_URL = "https://books.toscrape.com/catalogue/"
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks forever
BOOK_COLUMNS = [
    "upc",
    "title",
    "price",
    "rating",
    "genre",
    "availability",
    "description",
]


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Hand BeautifulSoup the raw bytes so it detects the encoding itself.
    # The original detail-page code went through ``response.text`` and had to
    # strip a stray "Â" from prices as a workaround.
    return BeautifulSoup(response.content, "html.parser")


def _parse_book(detail_soup):
    """Return one book record (a dict keyed by ``BOOK_COLUMNS``).

    Every field is ``None`` when the corresponding markup is missing.
    """
    # ``string=`` replaces the deprecated ``text=`` keyword of find().
    upc_header = detail_soup.find("th", string="UPC")
    upc_cell = upc_header.find_next("td") if upc_header is not None else None

    title_tag = detail_soup.find("h1")
    price_tag = detail_soup.find("p", class_="price_color")
    rating_tag = detail_soup.find("p", class_="star-rating")
    breadcrumb = detail_soup.find("ul", class_="breadcrumb")
    stock_tag = detail_soup.find("p", class_="instock availability")
    description_header = detail_soup.find("div", id="product_description")

    book_price = None
    if price_tag is not None:
        # Kept as text (as before). The "Â" removal is only a safety net in
        # case a page is still decoded with the wrong charset.
        book_price = price_tag.text.strip().replace("£", "").replace("Â", "")

    book_rating = None
    if rating_tag is not None:
        rating_classes = rating_tag.get("class") or []
        # The second CSS class carries the rating word (e.g. "Four").
        book_rating = rating_classes[1] if len(rating_classes) > 1 else None

    book_genre = None
    if breadcrumb is not None:
        crumb_links = breadcrumb.find_all("a")
        # The original read the second link, which yielded "Books" for every
        # row. The last link is taken instead as the most specific category.
        # NOTE(review): confirm against the site's breadcrumb markup.
        book_genre = crumb_links[-1].text if crumb_links else None

    book_availability = None
    if stock_tag is not None:
        # Reduce e.g. "In stock (19 available)" to "19 available".
        book_availability = (
            stock_tag.get_text(strip=True)
            .replace("(", "")
            .replace(")", "")
            .replace("In stock", "")
            .strip()
        )

    book_description = None
    if description_header is not None:
        description_tag = description_header.find_next("p")
        if description_tag is not None:
            book_description = description_tag.text

    return {
        "upc": upc_cell.text if upc_cell is not None else None,
        "title": title_tag.text if title_tag is not None else None,
        "price": book_price,
        "rating": book_rating,
        "genre": book_genre,
        "availability": book_availability,
        "description": book_description,
    }


# Parallel lists filled from the listing pages: entry i of each list belongs
# to the same product.
prices = []
reviews = []
urls = []
min_rating = ["Four", "Five"]
max_price = 20

# NOTE(review): the original comment announced 10 pages, but the loop only
# covered pages 1-4; the range is kept unchanged.
for plp in range(1, 5):
    soup = _fetch_soup(f"{CATALOGUE_URL}page-{plp}.html")

    # Read price, rating and link from the same product card so the three
    # lists cannot drift out of alignment when a card lacks one of them.
    for product in soup.find_all("article", class_="product_pod"):
        price_tag = product.find("p", class_="price_color")
        rating_tag = product.find("p", class_="star-rating")
        heading = product.find("h3")
        link_tag = heading.a if heading is not None else None
        if price_tag is None or rating_tag is None or link_tag is None:
            continue

        prices.append(float(price_tag.text.replace("£", "")))
        reviews.append(rating_tag.get("class")[1])
        urls.append(link_tag["href"])


def scrape_books(min_rating, max_price):
    """Fetch the detail page of every listed book that passes the filter.

    Reads the module-level ``prices``, ``reviews`` and ``urls`` lists, so the
    listing pages must already have been scraped.

    Args:
        min_rating: Collection of accepted rating words, e.g.
            ``["Four", "Five"]``; a book is kept when its rating is in it.
        max_price: Highest listing price (inclusive) a book may have.

    Returns:
        A ``pandas.DataFrame`` with the columns in ``BOOK_COLUMNS``, one row
        per matching book. It is empty, but keeps those columns, when
        nothing matches.

    Raises:
        requests.HTTPError: if a detail page answers with an error status.
    """
    books_data = []
    for price, review, url in zip(prices, reviews, urls):
        if not (price <= max_price and review in min_rating):
            continue
        # urljoin resolves the relative hrefs of the listing pages and also
        # tolerates an href that is already absolute.
        detail_soup = _fetch_soup(urljoin(CATALOGUE_URL, url))
        books_data.append(_parse_book(detail_soup))

    return pd.DataFrame(books_data, columns=BOOK_COLUMNS)
"C:\\Users\\MartínPazYáñez\\AppData\\Local\\Temp\\ipykernel_6408\\874691255.py:37: DeprecationWarning: The 'text' argument to find()-type methods is deprecated. Use 'string' instead.\n", + " book_upc = detail_soup.find(\"th\", text=\"UPC\").find_next(\"td\").text if detail_soup.find(\"th\", text=\"UPC\") else None\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + " | upc | \n", + "title | \n", + "price | \n", + "rating | \n", + "genre | \n", + "availability | \n", + "description | \n", + "
---|---|---|---|---|---|---|---|
0 | \n", + "ce6396b0f23f6ecc | \n", + "Set Me Free | \n", + "17.46 | \n", + "Five | \n", + "Books | \n", + "19 available | \n", + "Aaron Ledbetterâs future had been planned ou... | \n", + "
1 | \n", + "6258a1f6a6dcfe50 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "17.66 | \n", + "Five | \n", + "Books | \n", + "18 available | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "
2 | \n", + "6be3beb0793a53e7 | \n", + "Sophie's World | \n", + "15.94 | \n", + "Five | \n", + "Books | \n", + "18 available | \n", + "A page-turning novel that is also an explorati... | \n", + "
3 | \n", + "657fe5ead67a7767 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "14.27 | \n", + "Four | \n", + "Books | \n", + "16 available | \n", + "More than thirty-five years ago, when the weat... | \n", + "
4 | \n", + "51653ef291ab7ddc | \n", + "This One Summer | \n", + "19.49 | \n", + "Four | \n", + "Books | \n", + "16 available | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "
5 | \n", + "709822d0b5bcb7f4 | \n", + "Thirst | \n", + "17.27 | \n", + "Five | \n", + "Books | \n", + "16 available | \n", + "On a searing summer Friday, Eddie Chapman has ... | \n", + "