From f5350119a89735ea632ef1087a7cecb44a777872 Mon Sep 17 00:00:00 2001
From: Mauricio Bengochea Torres <Masteramida@gmail.com>
Date: Fri, 12 Sep 2025 13:56:29 +0200
Subject: [PATCH] Lab solution to lab-web-scraping.ipynb

---
 lab-web-scraping.ipynb | 110 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 5 deletions(-)

diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
index e552783..816d64c 100644
--- a/lab-web-scraping.ipynb
+++ b/lab-web-scraping.ipynb
@@ -110,14 +110,114 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 4,
       "id": "40359eee-9cd7-4884-bfa4-83344c222305",
       "metadata": {
         "id": "40359eee-9cd7-4884-bfa4-83344c222305"
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "C:\\Users\\LAP-MPC\\AppData\\Local\\Temp\\ipykernel_10456\\493026631.py:42: DeprecationWarning: The 'text' argument to find()-type methods is deprecated. Use 'string' instead.\n",
+            "  upc = book_soup.find(text='UPC').find_next('td').text\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                UPC                                              Title  \\\n",
+            "0  ce6396b0f23f6ecc                                        Set Me Free   \n",
+            "1  6258a1f6a6dcfe50  The Four Agreements: A Practical Guide to Pers...   \n",
+            "2  6be3beb0793a53e7                                     Sophie's World   \n",
+            "3  657fe5ead67a7767            Untitled Collection: Sabbath Poems 2014   \n",
+            "4  51653ef291ab7ddc                                    This One Summer   \n",
+            "\n",
+            "   Price (£)  Rating           Genre             Availability  \\\n",
+            "0      17.46       5     Young Adult  In stock (19 available)   \n",
+            "1      17.66       5    Spirituality  In stock (18 available)   \n",
+            "2      15.94       5      Philosophy  In stock (18 available)   \n",
+            "3      14.27       4          Poetry  In stock (16 available)   \n",
+            "4      19.49       4  Sequential Art  In stock (16 available)   \n",
+            "\n",
+            "                                         Description  \n",
+            "0  Aaron Ledbetterâs future had been planned ou...  \n",
+            "1  In The Four Agreements, don Miguel Ruiz reveal...  \n",
+            "2  A page-turning novel that is also an explorati...  \n",
+            "3  More than thirty-five years ago, when the weat...  \n",
+            "4  Every summer, Rose goes with her mom and dad t...  \n"
+          ]
+        }
+      ],
       "source": [
-        "# Your solution goes here"
+        "import requests\n",
+        "from bs4 import BeautifulSoup\n",
+        "import pandas as pd\n",
+        "\n",
+        "def get_rating(rating_class):\n",
+        "    \"\"\"Return the 1-5 star count for a rating class word ('One'..'Five'), or 0 if unknown.\"\"\"\n",
+        "    words = ('One', 'Two', 'Three', 'Four', 'Five')\n",
+        "    return dict(zip(words, range(1, 6))).get(rating_class, 0)\n",
+        "\n",
+        "def scrape_books(min_rating=4.0, max_price=20.0, timeout=10):\n",
+        "    \"\"\"Scrape books.toscrape.com for books rated at least min_rating and costing at most max_price.\n",
+        "\n",
+        "    Walks the paginated catalogue, opens the detail page of every matching book and returns a\n",
+        "    DataFrame with columns UPC, Title, Price (£), Rating, Genre, Availability and Description.\n",
+        "    timeout is the per-request limit in seconds; HTTP errors other than the final 404 raise.\n",
+        "    \"\"\"\n",
+        "    from urllib.parse import urljoin\n",
+        "\n",
+        "    base_url = 'http://books.toscrape.com/catalogue/page-{}.html'\n",
+        "    books_data = []\n",
+        "    page = 1\n",
+        "    while True:\n",
+        "        page_url = base_url.format(page)\n",
+        "        response = requests.get(page_url, timeout=timeout)\n",
+        "        if response.status_code == 404:  # requested a page past the end of the catalogue\n",
+        "            break\n",
+        "        response.raise_for_status()\n",
+        "        # Parse the raw bytes so bs4 detects the page encoding itself; response.text was\n",
+        "        # mis-decoding non-ASCII characters (typographic apostrophes in descriptions).\n",
+        "        soup = BeautifulSoup(response.content, 'html.parser')\n",
+        "        books = soup.select('article.product_pod')\n",
+        "        if not books:\n",
+        "            break\n",
+        "\n",
+        "        for book in books:\n",
+        "            rating = get_rating(book.p['class'][1])  # second class word is the star count\n",
+        "            # Keep only digits and the decimal point, dropping the currency symbol\n",
+        "            price_text = book.find('p', class_='price_color').text\n",
+        "            price = float(''.join(ch for ch in price_text if ch.isdigit() or ch == '.'))\n",
+        "            if not (rating >= min_rating and price <= max_price):\n",
+        "                continue\n",
+        "\n",
+        "            # Resolve the relative href against the listing URL and fetch the detail page\n",
+        "            book_url = urljoin(page_url, book.h3.a['href'])\n",
+        "            book_response = requests.get(book_url, timeout=timeout)\n",
+        "            book_response.raise_for_status()\n",
+        "            book_soup = BeautifulSoup(book_response.content, 'html.parser')\n",
+        "            description_tag = book_soup.find('meta', attrs={'name': 'description'})\n",
+        "            books_data.append({\n",
+        "                'UPC': book_soup.find(string='UPC').find_next('td').text,\n",
+        "                'Title': book.h3.a['title'],\n",
+        "                'Price (£)': price,\n",
+        "                'Rating': rating,\n",
+        "                'Genre': book_soup.find('ul', class_='breadcrumb').find_all('a')[2].text,\n",
+        "                'Availability': book_soup.find('p', class_='instock availability').text.strip(),\n",
+        "                'Description': description_tag['content'].strip() if description_tag else 'N/A',\n",
+        "            })\n",
+        "\n",
+        "        page += 1\n",
+        "\n",
+        "    # Build the result table (empty DataFrame when nothing matched)\n",
+        "    return pd.DataFrame(books_data)\n",
+        "\n",
+        "# Run the scraper for books rated 4+ stars costing at most 20 pounds, then preview the first rows\n",
+        "books_df_result = scrape_books(max_price=20.0, min_rating=4.0)\n",
+        "print(books_df_result.head(5))"
       ]
     }
   ],
@@ -126,7 +226,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "base",
       "language": "python",
       "name": "python3"
     },
@@ -140,7 +240,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.13.5"
     }
   },
   "nbformat": 4,