diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf56d3ff..fa5c627a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,14 +1,17 @@
+
 ## [1.26.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.26.0-beta.1) (2024-09-29)
 
 
-### Features
 
 
 * add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
 * add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
 
 
+
 ### Bug Fixes
 
+* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
+
 * integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
 * removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index dcd168f1..edab3005 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -1,5 +1,5 @@
 """
-Research_web module
+research_web module
 """
 import re
 from typing import List
@@ -8,10 +8,12 @@
 import requests
 from bs4 import BeautifulSoup
 
-def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080) -> List[str]:
+def search_on_web(query: str, search_engine: str = "Google",
+                  max_results: int = 10, port: int = 8080,
+                  timeout: int = 10) -> List[str]:
     """
-    Searches the web for a given query using specified search engine options.
+    Searches the web for a given query using specified search
+    engine options and filters out PDF links.
 
     Args:
         query (str): The search query to find on the internet.
@@ -19,29 +21,44 @@ def search_on_web(query: str, search_engine: str = "Google",
                                        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
         max_results (int, optional): The maximum number of search results to return.
         port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
+        timeout (int, optional): The number of seconds to wait
+                                 for a response from a request. Default is 10 seconds.
 
     Returns:
-        List[str]: A list of URLs as strings that are the search results.
+        List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
 
     Raises:
         ValueError: If the search engine specified is not supported.
+        requests.exceptions.Timeout: If the request times out.
 
     Example:
         >>> search_on_web("example query", search_engine="Google", max_results=5)
         ['http://example.com', 'http://example.org', ...]
     """
 
+    def filter_pdf_links(links: List[str]) -> List[str]:
+        """
+        Filters out any links that point to PDF files.
+
+        Args:
+            links (List[str]): A list of URLs as strings.
+
+        Returns:
+            List[str]: A list of URLs excluding any that end with '.pdf'.
+        """
+        return [link for link in links if not link.lower().endswith('.pdf')]
+
     if search_engine.lower() == "google":
         res = []
         for url in google_search(query, stop=max_results):
             res.append(url)
-        return res
+        return filter_pdf_links(res)
 
     elif search_engine.lower() == "duckduckgo":
         research = DuckDuckGoSearchResults(max_results=max_results)
         res = research.run(query)
         links = re.findall(r'https?://[^\s,\]]+', res)
-        return links
+        return filter_pdf_links(links)
 
     elif search_engine.lower() == "bing":
         headers = {
@@ -49,7 +66,7 @@ def search_on_web(query: str, search_engine: str = "Google",
             AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
         }
         search_url = f"https://www.bing.com/search?q={query}"
-        response = requests.get(search_url, headers=headers)
+        response = requests.get(search_url, headers=headers, timeout=timeout)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.text, "html.parser")
@@ -57,20 +74,16 @@ def search_on_web(query: str, search_engine: str = "Google",
         for result in soup.find_all('li', class_='b_algo', limit=max_results):
             link = result.find('a')['href']
             search_results.append(link)
-        return search_results
+        return filter_pdf_links(search_results)
 
     elif search_engine.lower() == "searxng":
         url = f"http://localhost:{port}"
-        params = {"q": query,
-                  "format": "json",
-                  "engines": "google,duckduckgo,brave,qwant,bing"}
-
-        response = requests.get(url, params=params)
-
+        params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
+        response = requests.get(url, params=params, timeout=timeout)
         data = response.json()
-        limited_results = data["results"][:max_results]
-        return limited_results
+        limited_results = [result['url'] for result in data["results"][:max_results]]
+        return filter_pdf_links(limited_results)
 
     else:
-        raise ValueError("""The only search engines available are
+        raise ValueError("""The only search engines available are
                          DuckDuckGo, Google, Bing, or SearXNG""")
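For reviewers, a minimal usage sketch of the updated helper. The module path comes from the diff header; the query string, engine choice, and print loop are illustrative assumptions, not part of this change.

```python
# Usage sketch for the updated search_on_web (illustrative call site;
# the query and engine below are assumptions, not part of the diff).
import requests

from scrapegraphai.utils.research_web import search_on_web

try:
    # `timeout` is new in this change; results come back with any
    # links ending in ".pdf" already filtered out.
    links = search_on_web("web scraping libraries",
                          search_engine="Bing",
                          max_results=5,
                          timeout=10)
    for link in links:
        print(link)
except requests.exceptions.Timeout:
    # Documented in the new Raises section: the request exceeded `timeout`.
    print("Search request timed out")
except ValueError as err:
    # Raised for unsupported search engines.
    print(err)
```

Note that `timeout` only guards the branches that issue their own `requests.get` calls (Bing and SearXNG); the Google and DuckDuckGo branches delegate to their client libraries, which handle timeouts internally.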