diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py
index ab38edc0..999fe42e 100644
--- a/examples/groq/smart_scraper_groq.py
+++ b/examples/groq/smart_scraper_groq.py
@@ -21,7 +21,8 @@
         "api_key": groq_key,
         "temperature": 0
     },
-    "headless": False
+    "headless": False,
+    "backend": "undetected_chromedriver"
 }
 
 # ************************************************
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index cb0cfd9a..ecc0582d 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -61,6 +61,31 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
 
+    async def ascrape_undetected_chromedriver(self, url: str) -> str:
+        """
+        Asynchronously scrape the content of a given URL using undetected-chromedriver with Selenium.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
+
+        """
+        import undetected_chromedriver as uc
+
+        logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
+        try:
+            driver = uc.Chrome(headless=self.headless)
+            # driver.get() returns None; the rendered HTML is exposed via page_source
+            driver.get(url)
+            results = driver.page_source
+            driver.quit()
+        except Exception as e:
+            results = f"Error: {e}"
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -75,7 +100,7 @@ async def ascrape_playwright(self, url: str) -> str:
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
-        logger.info("Starting scraping...")
+        logger.info(f"Starting scraping with {self.backend}...")
         results = ""
         async with async_playwright() as p:
             browser = await p.chromium.launch(
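As a quick illustration, here is a minimal sketch of how the new backend could be exercised directly. It assumes the loader class in chromium.py is `ChromiumLoader` and that its constructor accepts `backend` and `headless` keyword arguments that populate the `self.backend` and `self.headless` attributes read in the diff; those constructor details are inferred, not shown above.

```python
import asyncio

# Assumption: the loader class in scrapegraphai/docloaders/chromium.py is
# named ChromiumLoader and accepts backend/headless keyword arguments that
# populate the self.backend/self.headless attributes used in the diff.
from scrapegraphai.docloaders.chromium import ChromiumLoader

loader = ChromiumLoader(
    ["https://example.com"],
    backend="undetected_chromedriver",
    headless=True,
)

# The new coroutine returns the page HTML, or an "Error: ..." string on failure.
html = asyncio.run(loader.ascrape_undetected_chromedriver("https://example.com"))
print(html[:200])
```

In normal use the backend is not called directly; as the first hunk shows, setting `"backend": "undetected_chromedriver"` in the graph config is enough to route scraping through the new method.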