diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb index e552783..b8b197f 100644 --- a/lab-web-scraping.ipynb +++ b/lab-web-scraping.ipynb @@ -110,14 +110,623 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "40359eee-9cd7-4884-bfa4-83344c222305", "metadata": { "id": "40359eee-9cd7-4884-bfa4-83344c222305" }, "outputs": [], "source": [ - "# Your solution goes here" + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import pandas as pd\n", + "import time " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "03dd2cb9", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch the product page of a single book from the shop, then parse its HTML with BeautifulSoup\n", + "url = \"https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html\"\n", + "resp = requests.get(url)\n", + "soup = BeautifulSoup(resp.content, 'html.parser')" + ] + }, + { + "cell_type": "markdown", + "id": "a3823309", + "metadata": {}, + "source": [ + "# 1º Extract the information of one book " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "269cb1bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A Light in the Attic\n" + ] + } + ], + "source": [ + "# Next step: We extract the title of the book\n", + "title = soup.find(\"h1\").text\n", + "print(title)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2bf905b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "£51.77\n" + ] + } + ], + "source": [ + "# Next, we extract the price \n", + "price = soup.find(\"p\", class_= \"price_color\").get_text()\n", + "print(price)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "80b7257a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In stock (22 available)\n" + ] + } + ], + 
"source": [ + "# Next, extract the disponibility \n", + "disponibility = soup.find(\"p\", class_ = \"instock availability\").get_text().strip()\n", + "print(disponibility)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "684aa2e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rating: 3\n" + ] + } + ], + "source": [ + "# Next, extract the rating\n", + "rating_tag = soup.find(\"p\", class_=\"star-rating\")\n", + "# rating_tag[\"class\"] is like: [\"star-rating\", \"Four\"]\n", + "rating_word = rating_tag[\"class\"][1] # \"Four\"\n", + "\n", + "# Convert to number\n", + "rating_map = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n", + "rating = rating_map[rating_word]\n", + "print(\"Rating:\", rating)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6d74d29a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UPC: a897fe39b1053632\n" + ] + } + ], + "source": [ + "# We found all rows in the table\n", + "rows = soup.find_all(\"tr\")\n", + "\n", + "# We loop until we find the one that has \"UPC\"\n", + "upc = None\n", + "for row in rows:\n", + " if row.find(\"th\").get_text() == \"UPC\":\n", + " upc = row.find(\"td\").get_text()\n", + " break\n", + "\n", + "print(\"UPC:\", upc)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6e32bdca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Género: Poetry\n" + ] + } + ], + "source": [ + "# Extract the genre of the book\n", + "\n", + "breadcrumb = soup.select(\"ul.breadcrumb li\")\n", + "genre = breadcrumb[2].get_text(strip=True) \n", + "print(\"Género:\", genre)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9a31bb5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Description: It's hard to imagine a world without A Light 
in the Attic. This now-classic collection of poetry and...\n" + ] + } + ], + "source": [ + "# Finally, extract the description\n", + "description_tag = soup.find(\"div\", id=\"product_description\")\n", + "if description_tag:\n", + " # The next <p> after the <h2> inside the div\n", + " description = description_tag.find_next(\"p\").get_text()\n", + "else:\n", + " description = \"\"\n", + "\n", + "print(\"Description:\", description[:100] + \"...\") " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a93c7fb9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'UPC': 'a897fe39b1053632', 'Title': 'A Light in the Attic', 'Price (£)': '£51.77', 'Rating': 3, 'Genre': 'Poetry', 'Availability': 'In stock (22 available)', 'Description': \"It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. 
...more\"}\n" + ] + } + ], + "source": [ + "book = {\n", + " \"UPC\": upc,\n", + " \"Title\": title,\n", + " \"Price (£)\": price,\n", + " \"Rating\": rating,\n", + " \"Genre\": genre,\n", + " \"Availability\": disponibility,\n", + " \"Description\": description\n", + "}\n", + "\n", + "print(book)" + ] + }, + { + "cell_type": "markdown", + "id": "e2d14e0c", + "metadata": {}, + "source": [ + "# 2º Apply the process to all books " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "df291080", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Found 20 book links.\n", + "1. https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html\n", + "2. https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html\n", + "3. https://books.toscrape.com/catalogue/soumission_998/index.html\n" + ] + } + ], + "source": [ + "# Get book links from first page \n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "url = \"https://books.toscrape.com/\"\n", + "response = requests.get(url)\n", + "soup = BeautifulSoup(response.content, 'html.parser')\n", + "\n", + "links = []\n", + "for article in soup.select(\"article.product_pod\"):\n", + " href = article.find(\"h3\").find(\"a\")[\"href\"]\n", + " full_url = \"https://books.toscrape.com/\" + href\n", + " links.append(full_url)\n", + "\n", + "print(f\" Found {len(links)} book links.\")\n", + "for i, link in enumerate(links[:3], 1):\n", + " print(f\"{i}. 
{link}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2a9c9223", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " A Light in the Attic...\n", + " Tipping the Velvet...\n", + " Soumission...\n", + " Sharp Objects...\n", + " Sapiens: A Brief History of Hu...\n", + " The Requiem Red...\n", + " The Dirty Little Secrets of Ge...\n", + " The Coming Woman: A Novel Base...\n", + " The Boys in the Boat: Nine Ame...\n", + " The Black Maria...\n", + " Starving Hearts (Triangular Tr...\n", + " Shakespeare's Sonnets...\n", + " Set Me Free...\n", + " Scott Pilgrim's Precious Littl...\n", + " Rip it Up and Start Again...\n", + " Our Band Could Be Your Life: S...\n", + " Olio...\n", + " Mesaerion: The Best Science Fi...\n", + " Libertarianism for Beginners...\n", + " It's Only the Himalayas...\n", + "\n", + " Total books scraped: 20\n" + ] + } + ], + "source": [ + "# Scrape all 20 books (no filters) \n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "def rating_to_number(word):\n", + " return {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}.get(word, 0)\n", + "\n", + "# Get links\n", + "url = \"https://books.toscrape.com/\"\n", + "response = requests.get(url)\n", + "soup = BeautifulSoup(response.content, 'html.parser')\n", + "links = [\n", + " \"https://books.toscrape.com/\" + art.find(\"h3\").find(\"a\")[\"href\"]\n", + " for art in soup.select(\"article.product_pod\")\n", + "]\n", + "\n", + "# Scrape each book\n", + "books = []\n", + "for link in links:\n", + " resp = requests.get(link)\n", + " s = BeautifulSoup(resp.content, 'html.parser')\n", + " \n", + " book = {\n", + " \"UPC\": s.find(\"th\", string=\"UPC\").find_next(\"td\").get_text(strip=True),\n", + " \"Title\": s.find(\"h1\").get_text(strip=True),\n", + " \"Price (£)\": float(s.find(\"p\", class_=\"price_color\").get_text().replace(\"£\", \"\")),\n", + " \"Rating\": rating_to_number(s.find(\"p\", 
class_=\"star-rating\")[\"class\"][1]),\n", + " \"Genre\": s.select(\"ul.breadcrumb li\")[2].get_text(strip=True),\n", + " \"Availability\": s.find(\"p\", class_=\"instock availability\").get_text(strip=True),\n", + " \"Description\": (\n", + " s.find(\"div\", id=\"product_description\").find_next(\"p\").get_text(strip=True)\n", + " if s.find(\"div\", id=\"product_description\") and s.find(\"div\", id=\"product_description\").find_next(\"p\")\n", + " else \"\"\n", + " )\n", + " }\n", + " books.append(book)\n", + " print(f\" {book['Title'][:30]}...\")\n", + "\n", + "print(f\"\\n Total books scraped: {len(books)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "398f1b65", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter books by min_rating and max_price \n", + "def rating_to_number(word):\n", + " return {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}.get(word, 0)\n", + "\n", + "def scrape_books(min_rating: float, max_price: float):\n", + " # Get book links (first page only)\n", + " url = \"https://books.toscrape.com/\"\n", + " response = requests.get(url)\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " links = [\n", + " \"https://books.toscrape.com/\" + art.find(\"h3\").find(\"a\")[\"href\"]\n", + " for art in soup.select(\"article.product_pod\")\n", + " ]\n", + "\n", + " filtered_books = []\n", + " for link in links:\n", + " resp = requests.get(link)\n", + " s = BeautifulSoup(resp.content, 'html.parser')\n", + " \n", + " title = s.find(\"h1\").get_text(strip=True)\n", + " price = float(s.find(\"p\", class_=\"price_color\").get_text().replace(\"£\", \"\"))\n", + " rating = rating_to_number(s.find(\"p\", class_=\"star-rating\")[\"class\"][1])\n", + " \n", + " # Apply filters\n", + " if rating >= min_rating and price <= max_price:\n", + " book = {\n", + " \"UPC\": s.find(\"th\", string=\"UPC\").find_next(\"td\").get_text(strip=True),\n", + " \"Title\": title,\n", + " \"Price (£)\": price,\n", + 
" \"Rating\": rating,\n", + " \"Genre\": s.select(\"ul.breadcrumb li\")[2].get_text(strip=True),\n", + " \"Availability\": s.find(\"p\", class_=\"instock availability\").get_text(strip=True),\n", + " \"Description\": (\n", + " s.find(\"div\", id=\"product_description\").find_next(\"p\").get_text(strip=True)\n", + " if s.find(\"div\", id=\"product_description\") and s.find(\"div\", id=\"product_description\").find_next(\"p\")\n", + " else \"\"\n", + " )\n", + " }\n", + " filtered_books.append(book)\n", + " print(f\" Kept: {title[:30]}... (Rating: {rating}, Price: £{price})\")\n", + " else:\n", + " print(f\" Skipped: {title[:30]}... (Rating: {rating}, Price: £{price})\")\n", + " \n", + " return filtered_books\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "25d024b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Kept: Set Me Free... (Rating: 5, Price: £17.46)\n", + " UPC Title Price (£) Rating Genre \\\n", + "0 ce6396b0f23f6ecc Set Me Free 17.46 5 Young Adult \n", + "\n", + " Availability Description \n", + "0 In stock (19 available) Aaron Ledbetter’s future had been planned out ... 
\n", + "\n", + " Total de libros encontrados: 1\n" + ] + } + ], + "source": [ + "# STEP 5: Convert filtered books to a pandas DataFrame \n", + "\n", + "def rating_to_number(word):\n", + " return {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}.get(word, 0)\n", + "\n", + "def scrape_books(min_rating: float, max_price: float) -> pd.DataFrame:\n", + " # Get book links (first page only)\n", + " url = \"https://books.toscrape.com/\"\n", + " response = requests.get(url)\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " links = [\n", + " \"https://books.toscrape.com/\" + art.find(\"h3\").find(\"a\")[\"href\"]\n", + " for art in soup.select(\"article.product_pod\")\n", + " ]\n", + "\n", + " filtered_books = []\n", + " for link in links:\n", + " resp = requests.get(link)\n", + " s = BeautifulSoup(resp.content, 'html.parser')\n", + " \n", + " title = s.find(\"h1\").get_text(strip=True)\n", + " price = float(s.find(\"p\", class_=\"price_color\").get_text().replace(\"£\", \"\"))\n", + " rating = rating_to_number(s.find(\"p\", class_=\"star-rating\")[\"class\"][1])\n", + " \n", + " # Apply filters\n", + " if rating >= min_rating and price <= max_price:\n", + " book = {\n", + " \"UPC\": s.find(\"th\", string=\"UPC\").find_next(\"td\").get_text(strip=True),\n", + " \"Title\": title,\n", + " \"Price (£)\": price,\n", + " \"Rating\": rating,\n", + " \"Genre\": s.select(\"ul.breadcrumb li\")[2].get_text(strip=True),\n", + " \"Availability\": s.find(\"p\", class_=\"instock availability\").get_text(strip=True),\n", + " \"Description\": (\n", + " s.find(\"div\", id=\"product_description\").find_next(\"p\").get_text(strip=True)\n", + " if s.find(\"div\", id=\"product_description\") and s.find(\"div\", id=\"product_description\").find_next(\"p\")\n", + " else \"\"\n", + " )\n", + " }\n", + " filtered_books.append(book)\n", + " print(f\" Kept: {title[:30]}... 
(Rating: {rating}, Price: £{price})\")\n", + " \n", + "# Convert list of dicts to pandas DataFrame\n", + " df = pd.DataFrame(filtered_books, columns=[\n", + " \"UPC\",\n", + " \"Title\",\n", + " \"Price (£)\",\n", + " \"Rating\",\n", + " \"Genre\",\n", + " \"Availability\",\n", + " \"Description\"\n", + " ])\n", + " return df\n", + "\n", + "# Example \n", + "df = scrape_books(min_rating=5.0, max_price=30.0)\n", + "print(df)\n", + "print(f\"\\n Total de libros encontrados: {len(df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ebb17863", + "metadata": {}, + "outputs": [], + "source": [ + "# STEP 6: Scrape from ALL pages \n", + "\n", + "def rating_to_number(word):\n", + " return {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}.get(word, 0)\n", + "\n", + "def get_all_book_links():\n", + " \"\"\"Collect links to every book on every page.\"\"\"\n", + " base_url = \"https://books.toscrape.com/catalogue/\"\n", + " current_page_url = \"https://books.toscrape.com/catalogue/page-1.html\"\n", + " all_links = []\n", + "\n", + " while True:\n", + " print(f\"Scraping: {current_page_url}\")\n", + " response = requests.get(current_page_url)\n", + " if response.status_code != 200:\n", + " break # No more pages\n", + "\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " articles = soup.select(\"article.product_pod\")\n", + " \n", + " for article in articles:\n", + " href = article.find(\"h3\").find(\"a\")[\"href\"]\n", + " # Clean relative path and build full URL\n", + " full_url = \"https://books.toscrape.com/catalogue/\" + href.replace(\"../\", \"\")\n", + " all_links.append(full_url)\n", + "\n", + " # Check for \"next\" page\n", + " next_button = soup.select_one(\"li.next a\")\n", + " if next_button:\n", + " next_href = next_button[\"href\"]\n", + " # Build next page URL relative to current base\n", + " if \"page-\" in current_page_url:\n", + " current_base = current_page_url.rsplit(\"/\", 1)[0] + \"/\"\n", + " 
else:\n", + " current_base = \"https://books.toscrape.com/catalogue/\"\n", + " current_page_url = current_base + next_href\n", + " else:\n", + " break # No more pages\n", + "\n", + " return all_links\n", + "\n", + "def scrape_books(min_rating: float, max_price: float) -> pd.DataFrame:\n", + " \"\"\"Scrape and filter books from ALL pages.\"\"\"\n", + " print(\" Collecting all book links...\")\n", + " all_links = get_all_book_links()\n", + " print(f\" Found {len(all_links)} books across all pages.\")\n", + "\n", + " filtered_books = []\n", + " for i, link in enumerate(all_links, 1):\n", + " try:\n", + " resp = requests.get(link)\n", + " s = BeautifulSoup(resp.content, 'html.parser')\n", + "\n", + " title = s.find(\"h1\").get_text(strip=True)\n", + " price = float(s.find(\"p\", class_=\"price_color\").get_text().replace(\"£\", \"\"))\n", + " rating_word = s.find(\"p\", class_=\"star-rating\")[\"class\"][1]\n", + " rating = rating_to_number(rating_word)\n", + "\n", + " # Apply filters early to save time\n", + " if rating >= min_rating and price <= max_price:\n", + " book = {\n", + " \"UPC\": s.find(\"th\", string=\"UPC\").find_next(\"td\").get_text(strip=True),\n", + " \"Title\": title,\n", + " \"Price (£)\": price,\n", + " \"Rating\": rating,\n", + " \"Genre\": s.select(\"ul.breadcrumb li\")[2].get_text(strip=True),\n", + " \"Availability\": s.find(\"p\", class_=\"instock availability\").get_text(strip=True),\n", + " \"Description\": (\n", + " s.find(\"div\", id=\"product_description\").find_next(\"p\").get_text(strip=True)\n", + " if s.find(\"div\", id=\"product_description\") and s.find(\"div\", id=\"product_description\").find_next(\"p\")\n", + " else \"\"\n", + " )\n", + " }\n", + " filtered_books.append(book)\n", + " print(f\" [{i}/{len(all_links)}] Kept: {title[:40]}...\")\n", + "\n", + " except Exception as e:\n", + " print(f\" [{i}/{len(all_links)}] Error on {link}: {e}\")\n", + " continue\n", + "\n", + " df = pd.DataFrame(filtered_books, columns=[\n", + " 
\"UPC\", \"Title\", \"Price (£)\", \"Rating\", \"Genre\", \"Availability\", \"Description\"\n", + " ])\n", + " return df\n", + "\n", + "\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "228fdbb8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Kept: The Coming Woman: A Novel Base... (Rating: 3, Price: £17.93)\n", + " Kept: The Boys in the Boat: Nine Ame... (Rating: 4, Price: £22.6)\n", + " Kept: Shakespeare's Sonnets... (Rating: 4, Price: £20.66)\n", + " Kept: Set Me Free... (Rating: 5, Price: £17.46)\n", + " UPC Title \\\n", + "0 e72a5dfc7e9267b2 The Coming Woman: A Novel Based on the Life of... \n", + "1 e10e1e165dc8be4a The Boys in the Boat: Nine Americans and Their... \n", + "2 30a7f60cd76ca58c Shakespeare's Sonnets \n", + "3 ce6396b0f23f6ecc Set Me Free \n", + "\n", + " Price (£) Rating Genre Availability \\\n", + "0 17.93 3 Default In stock (19 available) \n", + "1 22.60 4 Default In stock (19 available) \n", + "2 20.66 4 Poetry In stock (19 available) \n", + "3 17.46 5 Young Adult In stock (19 available) \n", + "\n", + " Description \n", + "0 \"If you have a heart, if you have a soul, Kare... \n", + "1 For readers of Laura Hillenbrand's Seabiscuit ... \n", + "2 This book is an important and complete collect... \n", + "3 Aaron Ledbetter’s future had been planned out ... \n", + "\n", + " Total de libros encontrados: 4\n" + ] + } + ], + "source": [ + "df = scrape_books(min_rating=3.0, max_price=30.0)\n", + "print(df)\n", + "print(f\"\\n Total de libros encontrados: {len(df)}\")" ] } ], @@ -126,7 +735,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -140,7 +749,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.10" } }, "nbformat": 4,