import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE = "http://books.toscrape.com/"
CATALOGUE = urljoin(BASE, "catalogue/")

HEADERS = {"User-Agent": "Mozilla/5.0"}
TIMEOUT = 10

# Matches one decimal number ("51.77", "51", ".5"). Unlike the looser
# "[\d.]+", it can never match a lone "." or "1.2.3", which float() rejects.
_PRICE_RE = re.compile(r"\d+(?:\.\d+)?|\.\d+")


def fetch_soup(url):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    The request is sent with ``HEADERS`` and gives up after ``TIMEOUT``
    seconds.

    Raises:
        requests.RequestException: if the request fails or the server
            answers with an HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def get_book_url(max_pages=50):
    """Collect the absolute URL of every book linked from the listing pages.

    Listing pages are requested as ``page-1.html`` .. ``page-<max_pages>.html``
    under ``CATALOGUE``. A page that cannot be fetched is reported on stdout
    and skipped, so one failure does not abort the whole crawl.

    Args:
        max_pages: number of listing pages to visit (defaults to 50).

    Returns:
        A list of absolute book-page URLs, in listing order.
    """
    all_book_url = []
    for number in range(1, max_pages + 1):
        page_url = urljoin(CATALOGUE, f"page-{number}.html")
        try:
            soup = fetch_soup(page_url)
        except requests.RequestException as e:
            print(f"page not found {page_url} -> {e}")
            continue

        for a in soup.select("article.product_pod h3 a"):
            href = a.get("href")
            if not href:
                # urljoin() with an empty/missing href returns the listing
                # page itself, which would be recorded as a bogus "book".
                continue
            all_book_url.append(urljoin(page_url, href))

    return all_book_url


def text_or_none(el):
    """Return the whitespace-stripped text of *el*, or ``None`` if *el* is falsy."""
    return el.get_text(strip=True) if el else None


def parse_price(text):
    """Extract the first decimal number in *text* as a float.

    For example ``"£51.77"`` yields ``51.77``.

    Returns:
        The parsed number, or ``None`` when *text* is empty/``None`` or
        contains no number.
    """
    if not text:
        return None
    m = _PRICE_RE.search(text)
    return float(m.group()) if m else None
"\n", + "def get_book_data(min_rating, max_price):\n", + " all_book_url = get_book_url()\n", + " word_to_num = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n", + "\n", + " rows = []\n", + "\n", + " for url in all_book_url:\n", + " try:\n", + " soup = fetch_soup(url)\n", + " except requests.RequestException:\n", + " continue\n", + "\n", + " \n", + " title = text_or_none(soup.select_one(\"div.product_main h1\"))\n", + "\n", + " \n", + " rating_word = None\n", + " rating_p = soup.select_one(\"div.product_main p.star-rating\")\n", + " if rating_p and \"class\" in rating_p.attrs and len(rating_p[\"class\"]) > 1:\n", + " rating_word = rating_p[\"class\"][1]\n", + " rating_num = word_to_num.get(rating_word, 0)\n", + "\n", + " \n", + " table = {row.th.get_text(strip=True): row.td.get_text(strip=True)\n", + " for row in soup.select(\"table.table.table-striped tr\")}\n", + " upc = table.get(\"UPC\")\n", + " price_incl = table.get(\"Price (incl. tax)\")\n", + " availability = table.get(\"Availability\")\n", + "\n", + " price_val = parse_price(price_incl)\n", + "\n", + " \n", + " crumbs = [c.get_text(strip=True) for c in soup.select(\"ul.breadcrumb li a\")]\n", + " genre = crumbs[2] if len(crumbs) >= 3 else None\n", + "\n", + " \n", + " desc_header = soup.select_one(\"#product_description\")\n", + " if desc_header:\n", + " desc = desc_header.find_next(\"p\")\n", + " description = text_or_none(desc)\n", + " else:\n", + " description = None\n", + "\n", + " \n", + " if (price_val is not None) and (rating_num is not None):\n", + " if (price_val <= max_price) and (rating_num >= min_rating):\n", + " rows.append({\n", + " \"title\": title,\n", + " \"upc\": upc,\n", + " \"price_taxIncluded\": price_incl,\n", + " \"rating\": rating_word,\n", + " \"genre\": genre,\n", + " \"availability\": availability,\n", + " \"description\": description,\n", + " \"url\": url\n", + " })\n", + "\n", + " df = pd.DataFrame(rows, columns=[\n", + " 
\"title\",\"upc\",\"price_taxIncluded\",\"rating\",\"genre\",\"availability\",\"description\",\"url\"\n", + " ])\n", + " return df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f51de7cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | title | \n", + "upc | \n", + "price_taxIncluded | \n", + "rating | \n", + "genre | \n", + "availability | \n", + "description | \n", + "
---|---|---|---|---|---|---|---|
12 | \n", + "Set Me Free | \n", + "ce6396b0f23f6ecc | \n", + "£17.46 | \n", + "Five | \n", + "Young Adult | \n", + "In stock (19 available) | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "
30 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "6258a1f6a6dcfe50 | \n", + "£17.66 | \n", + "Five | \n", + "Spirituality | \n", + "In stock (18 available) | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "
34 | \n", + "Sophie's World | \n", + "6be3beb0793a53e7 | \n", + "£15.94 | \n", + "Five | \n", + "Philosophy | \n", + "In stock (18 available) | \n", + "A page-turning novel that is also an explorati... | \n", + "
47 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "657fe5ead67a7767 | \n", + "£14.27 | \n", + "Four | \n", + "Poetry | \n", + "In stock (16 available) | \n", + "More than thirty-five years ago, when the weat... | \n", + "
53 | \n", + "This One Summer | \n", + "51653ef291ab7ddc | \n", + "£19.49 | \n", + "Four | \n", + "Sequential Art | \n", + "In stock (16 available) | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
913 | \n", + "The Zombie Room | \n", + "9c96cd1329fbd82d | \n", + "£19.69 | \n", + "Five | \n", + "Default | \n", + "In stock (1 available) | \n", + "An unlikely bond is forged between three men f... | \n", + "
917 | \n", + "The Silent Wife | \n", + "b78deb463531d078 | \n", + "£12.34 | \n", + "Five | \n", + "Fiction | \n", + "In stock (1 available) | \n", + "A chilling psychological thriller about a marr... | \n", + "
934 | \n", + "The Girl You Lost | \n", + "4280ac3eab57aa5d | \n", + "£12.29 | \n", + "Five | \n", + "Mystery | \n", + "In stock (1 available) | \n", + "Eighteen years ago your baby daughter was snat... | \n", + "
937 | \n", + "The Edge of Reason (Bridget Jones #2) | \n", + "29fc016c459aeb14 | \n", + "£19.18 | \n", + "Four | \n", + "Womens Fiction | \n", + "In stock (1 available) | \n", + "Monday 27 January“7:15 a.m. Hurrah! The wilder... | \n", + "
997 | \n", + "A Spy's Devotion (The Regency Spies of London #1) | \n", + "19fec36a1dfb4c16 | \n", + "£16.97 | \n", + "Five | \n", + "Historical Fiction | \n", + "In stock (1 available) | \n", + "In England’s Regency era, manners and elegance... | \n", + "
75 rows × 7 columns
\n", + "