Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
340 changes: 336 additions & 4 deletions lab-web-scraping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,355 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "40359eee-9cd7-4884-bfa4-83344c222305",
"metadata": {
"id": "40359eee-9cd7-4884-bfa4-83344c222305"
},
"outputs": [],
"source": [
"# Your solution goes here"
"import pandas as pd\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9e22f8b1",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import requests\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urljoin\n",
"\n",
    "# Root URL of the site being scraped; CATALOGUE is the base that listing-page URLs are joined onto.\n",
    "BASE = \"http://books.toscrape.com/\"\n",
    "CATALOGUE = urljoin(BASE, \"catalogue/\")\n",
    "\n",
    "# Request settings used by fetch_soup for every HTTP call.\n",
    "HEADERS = {\"User-Agent\": \"Mozilla/5.0\"}\n",
    "TIMEOUT = 10  # passed to requests.get as its timeout\n",
"\n",
"def fetch_soup(url):\n",
" r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)\n",
" r.raise_for_status()\n",
" return BeautifulSoup(r.content, \"html.parser\")\n",
"\n",
"def get_book_url():\n",
" all_book_url = []\n",
" for number in range(1, 51):\n",
" page_url = urljoin(CATALOGUE, f\"page-{number}.html\")\n",
" try:\n",
" soup = fetch_soup(page_url)\n",
" except requests.RequestException as e:\n",
" print(f\"page not found {page_url} -> {e}\")\n",
" continue\n",
"\n",
" for a in soup.select(\"article.product_pod h3 a\"):\n",
" href = a.get(\"href\")\n",
" \n",
" book_url = urljoin(page_url, href)\n",
" all_book_url.append(book_url)\n",
"\n",
" return all_book_url\n",
"\n",
"def text_or_none(el):\n",
" return el.get_text(strip=True) if el else None\n",
"\n",
"def parse_price(text):\n",
" \n",
" if not text:\n",
" return None\n",
" m = re.search(r\"[\\d.]+\", text)\n",
" return float(m.group()) if m else None\n",
"\n",
"def get_book_data(min_rating, max_price):\n",
" all_book_url = get_book_url()\n",
" word_to_num = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n",
"\n",
" rows = []\n",
"\n",
" for url in all_book_url:\n",
" try:\n",
" soup = fetch_soup(url)\n",
" except requests.RequestException:\n",
" continue\n",
"\n",
" \n",
" title = text_or_none(soup.select_one(\"div.product_main h1\"))\n",
"\n",
" \n",
" rating_word = None\n",
" rating_p = soup.select_one(\"div.product_main p.star-rating\")\n",
" if rating_p and \"class\" in rating_p.attrs and len(rating_p[\"class\"]) > 1:\n",
" rating_word = rating_p[\"class\"][1]\n",
" rating_num = word_to_num.get(rating_word, 0)\n",
"\n",
" \n",
" table = {row.th.get_text(strip=True): row.td.get_text(strip=True)\n",
" for row in soup.select(\"table.table.table-striped tr\")}\n",
" upc = table.get(\"UPC\")\n",
" price_incl = table.get(\"Price (incl. tax)\")\n",
" availability = table.get(\"Availability\")\n",
"\n",
" price_val = parse_price(price_incl)\n",
"\n",
" \n",
" crumbs = [c.get_text(strip=True) for c in soup.select(\"ul.breadcrumb li a\")]\n",
" genre = crumbs[2] if len(crumbs) >= 3 else None\n",
"\n",
" \n",
" desc_header = soup.select_one(\"#product_description\")\n",
" if desc_header:\n",
" desc = desc_header.find_next(\"p\")\n",
" description = text_or_none(desc)\n",
" else:\n",
" description = None\n",
"\n",
" \n",
" if (price_val is not None) and (rating_num is not None):\n",
" if (price_val <= max_price) and (rating_num >= min_rating):\n",
" rows.append({\n",
" \"title\": title,\n",
" \"upc\": upc,\n",
" \"price_taxIncluded\": price_incl,\n",
" \"rating\": rating_word,\n",
" \"genre\": genre,\n",
" \"availability\": availability,\n",
" \"description\": description,\n",
" \"url\": url\n",
" })\n",
"\n",
" df = pd.DataFrame(rows, columns=[\n",
" \"title\",\"upc\",\"price_taxIncluded\",\"rating\",\"genre\",\"availability\",\"description\",\"url\"\n",
" ])\n",
" return df\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "f51de7cb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>upc</th>\n",
" <th>price_taxIncluded</th>\n",
" <th>rating</th>\n",
" <th>genre</th>\n",
" <th>availability</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Set Me Free</td>\n",
" <td>ce6396b0f23f6ecc</td>\n",
" <td>£17.46</td>\n",
" <td>Five</td>\n",
" <td>Young Adult</td>\n",
" <td>In stock (19 available)</td>\n",
" <td>Aaron Ledbetter’s future had been planned out ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>The Four Agreements: A Practical Guide to Pers...</td>\n",
" <td>6258a1f6a6dcfe50</td>\n",
" <td>£17.66</td>\n",
" <td>Five</td>\n",
" <td>Spirituality</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>In The Four Agreements, don Miguel Ruiz reveal...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>Sophie's World</td>\n",
" <td>6be3beb0793a53e7</td>\n",
" <td>£15.94</td>\n",
" <td>Five</td>\n",
" <td>Philosophy</td>\n",
" <td>In stock (18 available)</td>\n",
" <td>A page-turning novel that is also an explorati...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>Untitled Collection: Sabbath Poems 2014</td>\n",
" <td>657fe5ead67a7767</td>\n",
" <td>£14.27</td>\n",
" <td>Four</td>\n",
" <td>Poetry</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>More than thirty-five years ago, when the weat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>This One Summer</td>\n",
" <td>51653ef291ab7ddc</td>\n",
" <td>£19.49</td>\n",
" <td>Four</td>\n",
" <td>Sequential Art</td>\n",
" <td>In stock (16 available)</td>\n",
" <td>Every summer, Rose goes with her mom and dad t...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>913</th>\n",
" <td>The Zombie Room</td>\n",
" <td>9c96cd1329fbd82d</td>\n",
" <td>£19.69</td>\n",
" <td>Five</td>\n",
" <td>Default</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>An unlikely bond is forged between three men f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>917</th>\n",
" <td>The Silent Wife</td>\n",
" <td>b78deb463531d078</td>\n",
" <td>£12.34</td>\n",
" <td>Five</td>\n",
" <td>Fiction</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>A chilling psychological thriller about a marr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>934</th>\n",
" <td>The Girl You Lost</td>\n",
" <td>4280ac3eab57aa5d</td>\n",
" <td>£12.29</td>\n",
" <td>Five</td>\n",
" <td>Mystery</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Eighteen years ago your baby daughter was snat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>937</th>\n",
" <td>The Edge of Reason (Bridget Jones #2)</td>\n",
" <td>29fc016c459aeb14</td>\n",
" <td>£19.18</td>\n",
" <td>Four</td>\n",
" <td>Womens Fiction</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>Monday 27 January“7:15 a.m. Hurrah! The wilder...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>997</th>\n",
" <td>A Spy's Devotion (The Regency Spies of London #1)</td>\n",
" <td>19fec36a1dfb4c16</td>\n",
" <td>£16.97</td>\n",
" <td>Five</td>\n",
" <td>Historical Fiction</td>\n",
" <td>In stock (1 available)</td>\n",
" <td>In England’s Regency era, manners and elegance...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>75 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" title upc \\\n",
"12 Set Me Free ce6396b0f23f6ecc \n",
"30 The Four Agreements: A Practical Guide to Pers... 6258a1f6a6dcfe50 \n",
"34 Sophie's World 6be3beb0793a53e7 \n",
"47 Untitled Collection: Sabbath Poems 2014 657fe5ead67a7767 \n",
"53 This One Summer 51653ef291ab7ddc \n",
".. ... ... \n",
"913 The Zombie Room 9c96cd1329fbd82d \n",
"917 The Silent Wife b78deb463531d078 \n",
"934 The Girl You Lost 4280ac3eab57aa5d \n",
"937 The Edge of Reason (Bridget Jones #2) 29fc016c459aeb14 \n",
"997 A Spy's Devotion (The Regency Spies of London #1) 19fec36a1dfb4c16 \n",
"\n",
" price_taxIncluded rating genre availability \\\n",
"12 £17.46 Five Young Adult In stock (19 available) \n",
"30 £17.66 Five Spirituality In stock (18 available) \n",
"34 £15.94 Five Philosophy In stock (18 available) \n",
"47 £14.27 Four Poetry In stock (16 available) \n",
"53 £19.49 Four Sequential Art In stock (16 available) \n",
".. ... ... ... ... \n",
"913 £19.69 Five Default In stock (1 available) \n",
"917 £12.34 Five Fiction In stock (1 available) \n",
"934 £12.29 Five Mystery In stock (1 available) \n",
"937 £19.18 Four Womens Fiction In stock (1 available) \n",
"997 £16.97 Five Historical Fiction In stock (1 available) \n",
"\n",
" description \n",
"12 Aaron Ledbetter’s future had been planned out ... \n",
"30 In The Four Agreements, don Miguel Ruiz reveal... \n",
"34 A page-turning novel that is also an explorati... \n",
"47 More than thirty-five years ago, when the weat... \n",
"53 Every summer, Rose goes with her mom and dad t... \n",
".. ... \n",
"913 An unlikely bond is forged between three men f... \n",
"917 A chilling psychological thriller about a marr... \n",
"934 Eighteen years ago your baby daughter was snat... \n",
"937 Monday 27 January“7:15 a.m. Hurrah! The wilder... \n",
"997 In England’s Regency era, manners and elegance... \n",
"\n",
"[75 rows x 7 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_books_lowPrice_highRating"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d7fab39",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +472,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down