data-bootcamp-v4 · arnaurr94 · Aug 3, 2025
diff --git a/lab-web-scraping.ipynb b/lab-web-scraping.ipynb
@@ -110,14 +110,183 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 302,
       "id": "40359eee-9cd7-4884-bfa4-83344c222305",
       "metadata": {
         "id": "40359eee-9cd7-4884-bfa4-83344c222305"
       },
       "outputs": [],
       "source": [
-        "# Your solution goes here"
+        "# Import necessary libraries\n",
+        "import requests\n",
+        "from bs4 import BeautifulSoup\n",
+        "import pandas as pd"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 310,
+      "id": "606d2f30",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Define the function scrape books with the arguments min_rating and max_price\n",
+        "def scrape_books(min_rating, max_price, pages=50):\n",
+        "    \"\"\"Scrape books.toscrape.com and return the books that pass a rating/price filter.\n",
+        "\n",
+        "    Rating and price are read from the catalogue listing, so only the books that\n",
+        "    pass the filter cost an extra request for their own page.\n",
+        "\n",
+        "    Args:\n",
+        "        min_rating: lowest star rating to keep (1 to 5, inclusive).\n",
+        "        max_price: highest price in pounds to keep (inclusive).\n",
+        "        pages: number of catalogue pages to walk, starting at page 1.\n",
+        "\n",
+        "    Returns:\n",
+        "        DataFrame with columns title, genre, price_£, availability, rating, upc\n",
+        "        and description, indexed by each book's 0-based position in the catalogue.\n",
+        "\n",
+        "    Raises:\n",
+        "        requests.HTTPError: if a page answers with an error status.\n",
+        "    \"\"\"\n",
+        "    domain = \"https://books.toscrape.com/catalogue/\"\n",
+        "    columns = [\"title\", \"genre\", \"price_£\", \"availability\", \"rating\", \"upc\", \"description\"]\n",
+        "    my_nums = {\"one\": 1, \"two\": 2, \"three\": 3, \"four\": 4, \"five\": 5}\n",
+        "\n",
+        "    # Functions that read a book's card on a listing page\n",
+        "    def get_title(book):\n",
+        "        return book.find_all(\"a\")[1][\"title\"]\n",
+        "    def get_url(book):\n",
+        "        return domain + book.find_all(\"a\")[1][\"href\"]\n",
+        "    def get_price(book):\n",
+        "        price = book.find(\"p\", attrs={\"class\": \"price_color\"}).get_text()\n",
+        "        # A Python float is stored as float64; float32 turned 17.46 into 17.459999\n",
+        "        return float(price.replace(\"£\", \"\").strip())\n",
+        "    def get_availability(book):\n",
+        "        # strip() drops the indentation around the text, not only the newlines\n",
+        "        return book.find(\"p\", attrs={\"class\": \"instock availability\"}).get_text().strip()\n",
+        "    def get_rating(book):\n",
+        "        # The rating word is the second CSS class, e.g. class=\"star-rating Three\"\n",
+        "        rating = book.find(\"p\", class_=\"star-rating\").get(\"class\")[1].lower()\n",
+        "        return my_nums[rating]\n",
+        "\n",
+        "    # Functions that read the book's own page (its soup, not the listing one)\n",
+        "    def get_upc(book):\n",
+        "        return book.find(\"td\").get_text()\n",
+        "    def get_genre(book):\n",
+        "        return book.find(\"ul\", attrs={\"class\": \"breadcrumb\"}).find_all(\"li\")[2].get_text().strip()\n",
+        "    def get_description(book):\n",
+        "        # Look the text up by its header: a book without description has no such\n",
+        "        # block, and a positional find_all(\"p\")[3] would read an unrelated tag\n",
+        "        header = book.find(\"div\", attrs={\"id\": \"product_description\"})\n",
+        "        paragraph = header.find_next_sibling(\"p\") if header is not None else None\n",
+        "        return paragraph.get_text() if paragraph is not None else \"\"\n",
+        "\n",
+        "    def get_soup(session, url):\n",
+        "        # Fail on HTTP errors here instead of parsing an error page further down\n",
+        "        response = session.get(url, timeout=30)\n",
+        "        response.raise_for_status()\n",
+        "        return BeautifulSoup(response.content, \"html.parser\")\n",
+        "\n",
+        "    books_dict = {}\n",
+        "    key = 0\n",
+        "\n",
+        "    # One session reuses the connection for every request\n",
+        "    with requests.Session() as session:\n",
+        "        for number in range(1, pages + 1):\n",
+        "            soup = get_soup(session, f\"{domain}page-{number}.html\")\n",
+        "            # Find the grid of the page and, inside it, all the books\n",
+        "            book_grid = soup.find(\"ol\", attrs={\"class\": \"row\"})\n",
+        "            books = book_grid.find_all(\"li\", attrs={\"class\": \"col-xs-6 col-sm-4 col-md-3 col-lg-3\"})\n",
+        "\n",
+        "            for book in books:\n",
+        "                # key counts every book seen, kept or not, so the index of a kept\n",
+        "                # book is its position in the catalogue\n",
+        "                index, key = key, key + 1\n",
+        "                rating = get_rating(book)\n",
+        "                price = get_price(book)\n",
+        "                if not (rating >= min_rating and price <= max_price):\n",
+        "                    continue\n",
+        "                # Only now access the book url to retrieve the remaining data\n",
+        "                book_soup = get_soup(session, get_url(book))\n",
+        "                books_dict[index] = {\"title\": get_title(book),\n",
+        "                                     \"genre\": get_genre(book_soup),\n",
+        "                                     \"price_£\": price,\n",
+        "                                     \"availability\": get_availability(book),\n",
+        "                                     \"rating\": rating,\n",
+        "                                     \"upc\": get_upc(book_soup),\n",
+        "                                     \"description\": get_description(book_soup),\n",
+        "                }\n",
+        "    # columns= keeps the same columns even when no book passes the filter\n",
+        "    return pd.DataFrame.from_dict(books_dict, orient=\"index\", columns=columns)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 309,
+      "id": "663372a7",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>title</th>\n",
+              "      <th>genre</th>\n",
+              "      <th>price_£</th>\n",
+              "      <th>availability</th>\n",
+              "      <th>rating</th>\n",
+              "      <th>upc</th>\n",
+              "      <th>description</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>12</th>\n",
+              "      <td>Set Me Free</td>\n",
+              "      <td>Young Adult</td>\n",
+              "      <td>17.459999</td>\n",
+              "      <td>In stock</td>\n",
+              "      <td>5</td>\n",
+              "      <td>ce6396b0f23f6ecc</td>\n",
+              "      <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "          title        genre    price_£              availability  rating  \\\n",
+              "12  Set Me Free  Young Adult  17.459999              In stock           5   \n",
+              "\n",
+              "                 upc                                        description  \n",
+              "12  ce6396b0f23f6ecc  Aaron Ledbetter’s future had been planned out ...  "
+            ]
+          },
+          "execution_count": 309,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "scrape_books(min_rating=4, max_price=20)"
       ]
     }
   ],
@@ -126,7 +295,7 @@
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
+      "display_name": "base",
       "language": "python",
       "name": "python3"
     },
@@ -140,7 +309,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.13"
+      "version": "3.13.5"
     }
   },
   "nbformat": 4,