"""Scrape books.toscrape.com listings and collect details for selected books."""

from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

CATALOGUE_URL = "https://books.toscrape.com/catalogue/"
REQUEST_TIMEOUT = 10  # seconds; a stalled connection should not hang the notebook
BOOK_COLUMNS = [
    "upc",
    "title",
    "price",
    "rating",
    "genre",
    "availability",
    "description",
]


def _fetch_soup(url):
    """Download *url* and return the page parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Parse the raw bytes rather than ``response.text``: decoding through
    # ``response.text`` produced mojibake on the detail pages (a stray "Â"
    # before "£" and garbled apostrophes in the descriptions).
    return BeautifulSoup(response.content, "html.parser")


def collect_listing(pages):
    """Collect price, rating and detail link for every book on the listing pages.

    Args:
        pages: iterable of listing page numbers to crawl.

    Returns:
        Three parallel lists ``(prices, reviews, urls)``: the price as a float,
        the rating word taken from the ``star-rating`` class list, and the
        ``href`` of the book's detail page as written in the listing.
    """
    prices, reviews, urls = [], [], []
    for page in pages:
        soup = _fetch_soup(f"{CATALOGUE_URL}page-{page}.html")
        # Read all three fields from the same product card so the lists stay
        # aligned even if an unrelated element matched one of the selectors.
        for pod in soup.find_all("article", class_="product_pod"):
            price_text = pod.find("p", class_="price_color").text
            prices.append(float(price_text.replace("£", "")))
            reviews.append(pod.find("p", class_="star-rating").get("class")[1])
            urls.append(pod.find("h3").a["href"])
    return prices, reviews, urls


def _parse_book(detail_soup):
    """Extract the fields of one book from its parsed detail page.

    Every field falls back to ``None`` when its element is missing.
    """
    upc_header = detail_soup.find("th", string="UPC")
    upc_cell = upc_header.find_next("td") if upc_header else None
    title_tag = detail_soup.find("h1")
    price_tag = detail_soup.find("p", class_="price_color")
    rating_tag = detail_soup.find("p", class_="star-rating")
    breadcrumb = detail_soup.find("ul", class_="breadcrumb")
    crumb_links = breadcrumb.find_all("a") if breadcrumb else []
    stock_tag = detail_soup.find("p", class_="instock availability")
    description_header = detail_soup.find("div", id="product_description")
    description_tag = (
        description_header.find_next("p") if description_header else None
    )

    rating_classes = rating_tag.get("class") if rating_tag else []

    availability = None
    if stock_tag:
        # "In stock (19 available)" -> "19 available"
        availability = (
            stock_tag.get_text(strip=True)
            .replace("(", "")
            .replace(")", "")
            .replace("In stock", "")
            .strip()
        )

    return {
        "upc": upc_cell.text if upc_cell else None,
        "title": title_tag.text if title_tag else None,
        "price": price_tag.text.strip().replace("£", "") if price_tag else None,
        "rating": rating_classes[1] if len(rating_classes) > 1 else None,
        # The second breadcrumb link is the generic "Books" root (it was the
        # genre of every row before); the category is the link after it.
        "genre": crumb_links[2].text if len(crumb_links) > 2 else None,
        "availability": availability,
        "description": description_tag.text if description_tag else None,
    }


min_rating = ["Four", "Five"]
max_price = 20

# Listing pages 1-4 (the range end is exclusive); widen the range to crawl
# more of the catalogue.
prices, reviews, urls = collect_listing(range(1, 5))


def scrape_books(min_rating, max_price):
    """Fetch detail data for the collected books that pass both filters.

    Works on the module-level ``prices``, ``reviews`` and ``urls`` lists.

    Args:
        min_rating: collection of accepted rating words, e.g. ``["Four", "Five"]``.
        max_price: highest accepted listing price (inclusive).

    Returns:
        pandas.DataFrame with one row per matching book and the columns
        ``upc``, ``title``, ``price``, ``rating``, ``genre``, ``availability``
        and ``description``. The columns are present even when nothing matches.
    """
    books_data = []
    for price, review, url in zip(prices, reviews, urls):
        if price > max_price or review not in min_rating:
            continue
        detail_soup = _fetch_soup(urljoin(CATALOGUE_URL, url))
        books_data.append(_parse_book(detail_soup))

    return pd.DataFrame(books_data, columns=BOOK_COLUMNS)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
upctitlepriceratinggenreavailabilitydescription
0ce6396b0f23f6eccSet Me Free17.46FiveBooks19 availableAaron Ledbetter’s future had been planned ou...
16258a1f6a6dcfe50The Four Agreements: A Practical Guide to Pers...17.66FiveBooks18 availableIn The Four Agreements, don Miguel Ruiz reveal...
26be3beb0793a53e7Sophie's World15.94FiveBooks18 availableA page-turning novel that is also an explorati...
3657fe5ead67a7767Untitled Collection: Sabbath Poems 201414.27FourBooks16 availableMore than thirty-five years ago, when the weat...
451653ef291ab7ddcThis One Summer19.49FourBooks16 availableEvery summer, Rose goes with her mom and dad t...
5709822d0b5bcb7f4Thirst17.27FiveBooks16 availableOn a searing summer Friday, Eddie Chapman has ...
\n", + "
" + ], + "text/plain": [ + " upc title price \\\n", + "0 ce6396b0f23f6ecc Set Me Free 17.46 \n", + "1 6258a1f6a6dcfe50 The Four Agreements: A Practical Guide to Pers... 17.66 \n", + "2 6be3beb0793a53e7 Sophie's World 15.94 \n", + "3 657fe5ead67a7767 Untitled Collection: Sabbath Poems 2014 14.27 \n", + "4 51653ef291ab7ddc This One Summer 19.49 \n", + "5 709822d0b5bcb7f4 Thirst 17.27 \n", + "\n", + " rating genre availability \\\n", + "0 Five Books 19 available \n", + "1 Five Books 18 available \n", + "2 Five Books 18 available \n", + "3 Four Books 16 available \n", + "4 Four Books 16 available \n", + "5 Five Books 16 available \n", + "\n", + " description \n", + "0 Aaron Ledbetter’s future had been planned ou... \n", + "1 In The Four Agreements, don Miguel Ruiz reveal... \n", + "2 A page-turning novel that is also an explorati... \n", + "3 More than thirty-five years ago, when the weat... \n", + "4 Every summer, Rose goes with her mom and dad t... \n", + "5 On a searing summer Friday, Eddie Chapman has ... " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = scrape_books([\"Four\", \"Five\"], 20)\n", + "display(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "726251e1", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -126,7 +370,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -140,7 +384,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.13.7" } }, "nbformat": 4,