import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

_CATALOGUE_URL = "https://books.toscrape.com/catalogue/"
# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the whole scrape.
_REQUEST_TIMEOUT = 30
# The rating is read from a CSS class name ("One" .. "Five"); this maps it to a number.
_RATING_WORDS = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
# Everything except digits and the decimal point (drops the currency symbol).
_NON_PRICE_CHARS = re.compile(r"[^\d.]")


def _fetch_soup(url):
    """Download *url* and return it parsed, or ``None`` if the status is not 200.

    Network errors raised by ``requests`` (connection failure, timeout) are
    not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"page not found {url}")
        return None
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(response.content, "html.parser")


def _parse_price(text):
    """Convert a price string such as '£17.46' to a float."""
    return float(_NON_PRICE_CHARS.sub("", text))


def get_book_url():
    """Collect the URL of every book linked from catalogue pages 1 to 50.

    Returns:
        A list of absolute book-page URLs in listing order. Listing pages
        that do not answer with HTTP 200 are reported and skipped.
    """
    all_book_url = []

    for number in range(1, 51):
        soup = _fetch_soup(f"{_CATALOGUE_URL}page-{number}.html")
        if soup is None:
            # Do not try to parse an error page.
            continue
        # Each book title is an <h3> wrapping a link relative to the catalogue.
        all_book_url.extend(
            _CATALOGUE_URL + heading.find("a")["href"]
            for heading in soup.find_all("h3")
        )

    return all_book_url


def get_book_data(min_rating, max_price):
    """Scrape the details of every book that matches the rating and price limits.

    Args:
        min_rating: Lowest acceptable star rating (1-5), inclusive.
        max_price: Highest acceptable tax-inclusive price, inclusive. It is
            compared against the exact price, not a truncated one.

    Returns:
        A DataFrame with the columns title, upc, price_taxIncluded, rating,
        genre, availability and description. The index of each row is the
        position of the book in the list returned by ``get_book_url``.
    """
    columns_needed = [
        "title",
        "upc",
        "price_taxIncluded",
        "rating",
        "genre",
        "availability",
        "description",
    ]
    records = []
    positions = []

    for position, url in enumerate(get_book_url()):
        soup = _fetch_soup(url)
        if soup is None:
            continue

        # NOTE(review): every lookup below is positional (4th table row,
        # 4th link, 4th paragraph, ...) and relies on the current page
        # layout -- confirm if the site changes.
        rows = soup.find_all("tr")
        price_text = rows[3].find("td").get_text()
        rating = (
            soup.find("div", attrs={"class": "col-sm-6 product_main"})
            .find("p", class_="star-rating")["class"][1]
        )

        # Compare the full float price: truncating to int would let a book
        # priced at 20.99 through a max_price of 20.
        too_expensive = _parse_price(price_text) > max_price
        rated_too_low = _RATING_WORDS.get(rating, 0) < min_rating
        if too_expensive or rated_too_low:
            continue

        records.append(
            {
                "title": soup.find("h1").get_text(),
                "upc": rows[0].find("td").get_text(),
                "price_taxIncluded": price_text,
                "rating": rating,
                "genre": soup.find_all("a")[3].get_text(),
                "availability": rows[5].find("td").get_text(),
                "description": soup.find_all("p")[3].get_text(),
            }
        )
        positions.append(position)

    return pd.DataFrame(records, columns=columns_needed, index=positions)
\n", + " | title | \n", + "upc | \n", + "price_taxIncluded | \n", + "rating | \n", + "genre | \n", + "availability | \n", + "description | \n", + "
---|---|---|---|---|---|---|---|
12 | \n", + "Set Me Free | \n", + "ce6396b0f23f6ecc | \n", + "£17.46 | \n", + "Five | \n", + "Young Adult | \n", + "In stock (19 available) | \n", + "Aaron Ledbetter’s future had been planned out ... | \n", + "
30 | \n", + "The Four Agreements: A Practical Guide to Pers... | \n", + "6258a1f6a6dcfe50 | \n", + "£17.66 | \n", + "Five | \n", + "Spirituality | \n", + "In stock (18 available) | \n", + "In The Four Agreements, don Miguel Ruiz reveal... | \n", + "
34 | \n", + "Sophie's World | \n", + "6be3beb0793a53e7 | \n", + "£15.94 | \n", + "Five | \n", + "Philosophy | \n", + "In stock (18 available) | \n", + "A page-turning novel that is also an explorati... | \n", + "
47 | \n", + "Untitled Collection: Sabbath Poems 2014 | \n", + "657fe5ead67a7767 | \n", + "£14.27 | \n", + "Four | \n", + "Poetry | \n", + "In stock (16 available) | \n", + "More than thirty-five years ago, when the weat... | \n", + "
53 | \n", + "This One Summer | \n", + "51653ef291ab7ddc | \n", + "£19.49 | \n", + "Four | \n", + "Sequential Art | \n", + "In stock (16 available) | \n", + "Every summer, Rose goes with her mom and dad t... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
913 | \n", + "The Zombie Room | \n", + "9c96cd1329fbd82d | \n", + "£19.69 | \n", + "Five | \n", + "Default | \n", + "In stock (1 available) | \n", + "An unlikely bond is forged between three men f... | \n", + "
917 | \n", + "The Silent Wife | \n", + "b78deb463531d078 | \n", + "£12.34 | \n", + "Five | \n", + "Fiction | \n", + "In stock (1 available) | \n", + "A chilling psychological thriller about a marr... | \n", + "
934 | \n", + "The Girl You Lost | \n", + "4280ac3eab57aa5d | \n", + "£12.29 | \n", + "Five | \n", + "Mystery | \n", + "In stock (1 available) | \n", + "Eighteen years ago your baby daughter was snat... | \n", + "
937 | \n", + "The Edge of Reason (Bridget Jones #2) | \n", + "29fc016c459aeb14 | \n", + "£19.18 | \n", + "Four | \n", + "Womens Fiction | \n", + "In stock (1 available) | \n", + "Monday 27 January“7:15 a.m. Hurrah! The wilder... | \n", + "
997 | \n", + "A Spy's Devotion (The Regency Spies of London #1) | \n", + "19fec36a1dfb4c16 | \n", + "£16.97 | \n", + "Five | \n", + "Historical Fiction | \n", + "In stock (1 available) | \n", + "In England’s Regency era, manners and elegance... | \n", + "
75 rows × 7 columns
\n", + "