Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2.6_web_scraping-3.ipynb
Binary file added __pycache__/scraping_utils.cpython-313.pyc
Binary file not shown.
158 changes: 153 additions & 5 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,162 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>genre</th>\n",
" <th>UPC</th>\n",
" <th>Price</th>\n",
" <th>Availability</th>\n",
" <th>rating</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Sharp Objects</td>\n",
" <td>Mystery</td>\n",
" <td>e00eb4fd7b871a48</td>\n",
" <td>47.82</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>WICKED above her hipbone, GIRL across her hear...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>The Dirty Little Secrets of Getting Your Dream...</td>\n",
" <td>Business</td>\n",
" <td>2597b5a345f45e1b</td>\n",
" <td>33.34</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>Drawing on his extensive experience evaluating...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Coming Woman: A Novel Based on the Life of...</td>\n",
" <td>Default</td>\n",
" <td>e72a5dfc7e9267b2</td>\n",
" <td>17.93</td>\n",
" <td>19</td>\n",
" <td>3</td>\n",
" <td>\"If you have a heart, if you have a soul, Kare...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>The Boys in the Boat: Nine Americans and Their...</td>\n",
" <td>Default</td>\n",
" <td>e10e1e165dc8be4a</td>\n",
" <td>22.60</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>For readers of Laura Hillenbrand's Seabiscuit ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Shakespeare's Sonnets</td>\n",
" <td>Poetry</td>\n",
" <td>30a7f60cd76ca58c</td>\n",
" <td>20.66</td>\n",
" <td>19</td>\n",
" <td>4</td>\n",
" <td>This book is an important and complete collect...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Set Me Free</td>\n",
" <td>Young Adult</td>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>17.46</td>\n",
" <td>19</td>\n",
" <td>5</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Rip it Up and Start Again</td>\n",
" <td>Music</td>\n",
" <td>a34ba96d4081e6a4</td>\n",
" <td>35.02</td>\n",
" <td>19</td>\n",
" <td>5</td>\n",
" <td>Punk's raw power rejuvenated rock, but by the ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title genre \\\n",
"0 Sharp Objects Mystery \n",
"1 The Dirty Little Secrets of Getting Your Dream... Business \n",
"2 The Coming Woman: A Novel Based on the Life of... Default \n",
"3 The Boys in the Boat: Nine Americans and Their... Default \n",
"4 Shakespeare's Sonnets Poetry \n",
"5 Set Me Free Young Adult \n",
"6 Rip it Up and Start Again Music \n",
"\n",
" UPC Price Availability rating \\\n",
"0 e00eb4fd7b871a48 47.82 20 4 \n",
"1 2597b5a345f45e1b 33.34 19 4 \n",
"2 e72a5dfc7e9267b2 17.93 19 3 \n",
"3 e10e1e165dc8be4a 22.60 19 4 \n",
"4 30a7f60cd76ca58c 20.66 19 4 \n",
"5 ce6396b0f23f6ecc 17.46 19 5 \n",
"6 a34ba96d4081e6a4 35.02 19 5 \n",
"\n",
" description \n",
"0 WICKED above her hipbone, GIRL across her hear... \n",
"1 Drawing on his extensive experience evaluating... \n",
"2 \"If you have a heart, if you have a soul, Kare... \n",
"3 For readers of Laura Hillenbrand's Seabiscuit ... \n",
"4 This book is an important and complete collect... \n",
"5 Aaron Ledbetter’s future had been planned out ... \n",
"6 Punk's raw power rejuvenated rock, but by the ... "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your solution goes here"
"import scraping_utils as sr\n",
"\n",
"sr.scrape_books(min_rating=3, max_price=50)"
]
}
],
Expand All @@ -126,7 +274,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +288,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down
101 changes: 101 additions & 0 deletions scraping_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

nb_pages = 50
base_url = "https://books.toscrape.com/catalogue/"


def get_book_url(product):
book_url = product.find('a').get('href')
return book_url

def get_book_title(book_soup):
return book_soup.find('h1').text

def get_book_information(book_soup):
product_information = {}
information_table = book_soup.find('table')
details = information_table.find_all('th')
for detail in details:
if detail.text == 'UPC':
product_information['upc'] = detail.find_next_sibling().text
if detail.text == 'Price (incl. tax)':
product_information['price'] = float(detail.find_next_sibling().text[1:])
if detail.text == 'Availability':
product_information['availability'] = detail.find_next_sibling().text
return product_information

def get_book_genre(book_soup):
return book_soup.select_one('body > div > div > ul > li:nth-child(3) > a').text

def get_book_description(book_soup):
try:
return book_soup.select_one('#content_inner > article > p').text
except:
return 'no description'

def get_book_rating(book_element):
rating_map = {'One' : 1,
'Two' : 2,
'Three' : 3,
'Four' : 4,
'Five' : 5}
try:
return rating_map[book_element.select_one('p').get('class')[1]]
except:
return pd.nan

def extract_books(books, base_url, min_rating, max_price):
books_dict = {}
index = 0
for book in books:
book_url = get_book_url(book)
response = requests.get(base_url + book_url)
product_page = BeautifulSoup(response.content, "html.parser")
info = get_book_information(product_page)
rating = get_book_rating(book)
if max_price >= info['price'] and min_rating <= rating:
books_dict[index] = {
"title": get_book_title(product_page),
"genre": get_book_genre(product_page),
"UPC": info['upc'],
"Price": info['price'],
"Availability": info['availability'],
"rating": rating,
"description": get_book_description(product_page)
}
index += 1
return books_dict

def clean_data(df):
# Extract the number inside parentheses using regex and convert to int
df['Availability'] = df['Availability'].str.extract(r'\((\d+)')
df['Availability'] = pd.to_numeric(df['Availability'], errors='coerce')
return df

def scrape_books(min_rating, max_price):
books_dataframes_list = []

for page_number in range(nb_pages):
# Print current page number being processed
print(page_number)
# Construct URL for current catalog page
catalog_url = f"page-{page_number + 1}.html"
# Fetch the catalog page
response = requests.get(base_url + catalog_url)
# Parse HTML content using BeautifulSoup
catalog_page = BeautifulSoup(response.content, "html.parser")
# Find all book articles on the page
books = catalog_page.find_all('article', class_='product_pod')
# Extract book information from products using helper function
books_dict = extract_books(books, base_url, min_rating, max_price)
# Convert dictionary of books to pandas DataFrame
df = pd.DataFrame.from_dict(books_dict, orient='index')
# Append DataFrame to list of all books
books_dataframes_list.append(df)

books_df = pd.concat(books_dataframes_list, ignore_index=True)
books_df = clean_data(books_df)

return books_df