From 57ee79bae71536e57efd1e9bc99d5a98d80792fc Mon Sep 17 00:00:00 2001
From: Eduardo Romero
Date: Sat, 22 Nov 2025 18:23:21 +0100
Subject: [PATCH] Work on the solucion branch

---
 lab-web-scraping.ipynb | 73 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 1 deletion(-)

diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
index e552783..7446b2f 100644
--- a/lab-web-scraping.ipynb
+++ b/lab-web-scraping.ipynb
@@ -117,7 +117,78 @@
    },
    "outputs": [],
    "source": [
-    "# Your solution goes here"
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "import pandas as pd\n",
+    "import textwrap\n",
+    "import time  # used to avoid being blocked for making too many requests\n",
+    "\n",
+    "# Dictionary to convert rating words into numbers\n",
+    "rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}\n",
+    "\n",
+    "# Main function that accepts parameters\n",
+    "def scrape_books(min_rating, max_price):\n",
+    "    all_books = []  # List to store the books\n",
+    "\n",
+    "    # Walk through the 50 pages of the site\n",
+    "    for page in range(1, 51):\n",
+    "        url = f'http://books.toscrape.com/catalogue/page-{page}.html'\n",
+    "        response = requests.get(url)\n",
+    "        soup = BeautifulSoup(response.text, 'html.parser')\n",
+    "\n",
+    "        # Find all the books on the page\n",
+    "        books = soup.find_all('article', class_='product_pod')\n",
+    "\n",
+    "        for book in books:\n",
+    "            title = book.h3.a['title']\n",
+    "            price = float(book.find('p', class_='price_color').text.split('£')[-1])  # keep only the number after the currency symbol\n",
+    "            rating_word = book.find('p', class_='star-rating')['class'][1]\n",
+    "            rating = rating_map[rating_word]\n",
+    "\n",
+    "            # Apply the filters using the parameters\n",
+    "            if rating >= min_rating and price <= max_price:\n",
+    "                link = book.h3.a['href']\n",
+    "                detail_url = 'http://books.toscrape.com/catalogue/' + link\n",
+    "                detail_response = requests.get(detail_url)\n",
+    "                detail_soup = BeautifulSoup(detail_response.text, 'html.parser')\n",
+    "\n",
+    "                # Extract the additional fields from the detail page\n",
+    "                upc = detail_soup.find('th', string='UPC').find_next('td').text\n",
+    "                genre = detail_soup.select('ul.breadcrumb li')[2].text.strip()\n",
+    "                availability = detail_soup.find('p', class_='instock availability').text.strip()\n",
+    "\n",
+    "                desc_tag = detail_soup.find('div', id='product_description')\n",
+    "                if desc_tag:\n",
+    "                    description = desc_tag.find_next('p').text.strip()\n",
+    "                else:\n",
+    "                    description = 'No description available'\n",
+    "\n",
+    "                # Wrap the description so it displays nicely\n",
+    "                description = textwrap.fill(description, width=60)\n",
+    "\n",
+    "                # Store the data in a dictionary\n",
+    "                book_info = {\n",
+    "                    'UPC': upc,\n",
+    "                    'Title': title,\n",
+    "                    'Price (£)': price,\n",
+    "                    'Rating': rating,\n",
+    "                    'Genre': genre,\n",
+    "                    'Availability': availability,\n",
+    "                    'Description': description\n",
+    "                }\n",
+    "\n",
+    "                # Add the book to the list\n",
+    "                all_books.append(book_info)\n",
+    "\n",
+    "        # Wait half a second between pages so we do not overload the server\n",
+    "        time.sleep(0.5)\n",
+    "\n",
+    "    # Convert the list into a DataFrame\n",
+    "    return pd.DataFrame(all_books)\n",
+    "\n",
+    "# Example usage\n",
+    "books_df = scrape_books(min_rating=4, max_price=20)\n",
+    "print(books_df.head())  # Show the first books"
     ]
    }
  ],