diff --git a/CHANGELOG.md b/CHANGELOG.md index bf3a3bc9..b3f0d193 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,19 @@ -## [1.22.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.5...v1.22.0-beta.6) (2024-09-28) +## [1.25.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.1...v1.25.0) (2024-09-27) + + +### Features + +* add llama 3.2 ([90e6d07](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/90e6d077dc55b498b71928181065fc088acf943e)) + +## [1.24.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.0...v1.24.1) (2024-09-26) + ### Bug Fixes +* script creator multi ([9905be8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9905be8a37dc1ff4b90fe9b8be987887253be8bd)) + +## [1.24.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.23.1...v1.24.0) (2024-09-26) * integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04)) ## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27) @@ -10,6 +21,26 @@ ### Features +* add info to the dictionary for toghtherai ([3b5ee76](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b5ee767cbb91cb0ca8e4691195d16c3b57140bb)) +* update exception ([3876cb7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3876cb7be86e081065ca18c443647261a4b205d1)) + + +### Bug Fixes + +* chat for bedrock ([f9b121f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f9b121f7657e9eaf0b1b0e4a8574b8f1cbbd7c36)) +* graph Iterator node ([8ce08ba](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8ce08baf01d7757c6fdcab0333405787c67d2dbc)) +* issue about parser ([7eda6bc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7eda6bc06bc4c32850029f54b9b4c22f3124296e)) +* node refiner + examples ([d55f6be](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d55f6bee4766f174abb2fdcd598542a9ca108a25)) +* update to pydantic documentation ([76ce257](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ce257efb9d9f46c0693472a1fe54b39e4eb1ef)) + + +### CI + +* **release:** 1.21.2-beta.1 [skip ci] ([dd0f260](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/dd0f260e75aad97019fad49b09fed1b03d755d37)) +* **release:** 1.21.2-beta.2 [skip ci] ([ba4e863](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba4e863f1448564c3446ed4bb327f0eb5df50287)) +* **release:** 1.22.0-beta.1 [skip ci] ([f42a95f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f42a95faa05de39bd9cfc05e377d4b3da372e482)) +* **release:** 1.22.0-beta.2 [skip ci] ([431c09f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431c09f551ac28581674c6061f055fde0350ed4c)) +* **release:** 1.22.0-beta.3 [skip ci] ([e5ac020](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e5ac0205d1e04a8b31e86166c3673915b70fd1e3)) * add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24)) ## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27) @@ -22,6 +53,7 @@ ## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25) + ### Bug Fixes * update to pydantic documentation 
([76ce257](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ce257efb9d9f46c0693472a1fe54b39e4eb1ef)) @@ -36,6 +68,7 @@ ## [1.22.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.21.2-beta.2...v1.22.0-beta.1) (2024-09-24) + ### Features * add info to the dictionary for toghtherai ([3b5ee76](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b5ee767cbb91cb0ca8e4691195d16c3b57140bb)) diff --git a/examples/ernie/deep_scraper_ernie.py b/examples/ernie/deep_scraper_ernie.py deleted file mode 100644 index b8c6501a..00000000 --- a/examples/ernie/deep_scraper_ernie.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434"}, - "verbose": True, - "max_depth": 1 -} - - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/fireworks/deep_scraper_fireworks.py b/examples/fireworks/deep_scraper_fireworks.py deleted file mode 100644 index 86fb1717..00000000 --- a/examples/fireworks/deep_scraper_fireworks.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() 
-print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/script_multi_generator_fireworks.py b/examples/fireworks/script_multi_generator_fireworks.py index 98671768..669f187d 100644 --- a/examples/fireworks/script_multi_generator_fireworks.py +++ b/examples/fireworks/script_multi_generator_fireworks.py @@ -26,10 +26,9 @@ # ************************************************ # Create the ScriptCreatorGraph instance and run it # ************************************************ - urls=[ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" + "https://schultzbergagency.com/emil-raste-karlsen/", + "https://schultzbergagency.com/johanna-hedberg/", ] # ************************************************ @@ -37,7 +36,8 @@ # ************************************************ script_creator_graph = ScriptCreatorMultiGraph( - prompt="Who is Marco Perini?", + prompt="Find information about actors", + # also accepts a string with the already downloaded HTML code source=urls, config=graph_config ) diff --git a/examples/mistral/deep_scraper_mistral.py b/examples/mistral/deep_scraper_mistral.py deleted file mode 100644 index bf0f6ba4..00000000 --- a/examples/mistral/deep_scraper_mistral.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/mistral/script_multi_generator_mistral.py b/examples/mistral/script_multi_generator_mistral.py index f4d5d5b5..142b5140 100644 --- a/examples/mistral/script_multi_generator_mistral.py +++ b/examples/mistral/script_multi_generator_mistral.py @@ -29,8 +29,8 @@ # ************************************************ urls=[ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" + "https://schultzbergagency.com/emil-raste-karlsen/", + "https://schultzbergagency.com/johanna-hedberg/", ] # ************************************************ @@ -38,7 +38,8 @@ # ************************************************ 
script_creator_graph = ScriptCreatorMultiGraph( - prompt="Who is Marco Perini?", + prompt="Find information about actors", + # also accepts a string with the already downloaded HTML code source=urls, config=graph_config ) diff --git a/examples/nemotron/deep_scraper_nemotron.py b/examples/nemotron/deep_scraper_nemotron.py deleted file mode 100644 index 35f54b38..00000000 --- a/examples/nemotron/deep_scraper_nemotron.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/nemotron/script_multi_generator_nemotron.py b/examples/nemotron/script_multi_generator_nemotron.py index ad0b4b48..c1426e85 100644 --- a/examples/nemotron/script_multi_generator_nemotron.py +++ b/examples/nemotron/script_multi_generator_nemotron.py @@ -29,8 +29,8 @@ # ************************************************ urls=[ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" + "https://schultzbergagency.com/emil-raste-karlsen/", + "https://schultzbergagency.com/johanna-hedberg/", ] # ************************************************ @@ -38,7 +38,8 @@ # ************************************************ script_creator_graph = ScriptCreatorMultiGraph( - prompt="Who is Marco Perini?", + prompt="Find information about actors", + # also accepts a string with the already downloaded HTML code source=urls, config=graph_config ) diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py deleted file mode 100644 index b20e164d..00000000 --- a/examples/openai/deep_scraper_openai.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-4o", - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# 
Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/openai/script_multi_generator_openai.py b/examples/openai/script_multi_generator_openai.py index 3fdd029f..6693ac0f 100644 --- a/examples/openai/script_multi_generator_openai.py +++ b/examples/openai/script_multi_generator_openai.py @@ -29,8 +29,8 @@ # ************************************************ urls=[ - "https://perinim.github.io/", - "https://perinim.github.io/cv/" + "https://schultzbergagency.com/emil-raste-karlsen/", + "https://schultzbergagency.com/johanna-hedberg/", ] # ************************************************ @@ -38,7 +38,8 @@ # ************************************************ script_creator_graph = ScriptCreatorMultiGraph( - prompt="Who is Marco Perini?", + prompt="Find information about actors", + # also accepts a string with the already downloaded HTML code source=urls, config=graph_config ) diff --git a/pyproject.toml b/pyproject.toml index da9fdc9c..26b1fdb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.22.0b6" +version = "1.25.0" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index c9413d68..cc9e7e85 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -13,8 +13,10 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str], text_cont - `api_key`: The API key provided by BrowserBase. - `project_id`: The ID of the project on BrowserBase where you want to fetch data from. - `link`: The URL or link that you want to fetch data from. - - `text_content`: A boolean flag to specify whether to return only the text content (True) or the full HTML (False). - - `async_mode`: A boolean flag that determines whether the function runs asynchronously (True) or synchronously (False, default). + - `text_content`: A boolean flag to specify whether to return only the + text content (True) or the full HTML (False). + - `async_mode`: A boolean flag that determines whether the function runs asynchronously + (True) or synchronously (False, default). It initializes a Browserbase object with the given API key and project ID, then uses this object to load the specified link. @@ -37,8 +39,10 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str], text_cont api_key (str): The API key provided by BrowserBase. project_id (str): The ID of the project on BrowserBase where you want to fetch data from. link (str): The URL or link that you want to fetch data from. - text_content (bool): Whether to return only the text content (True) or the full HTML (False). Defaults to True. 
- async_mode (bool): Whether to run the function asynchronously (True) or synchronously (False). Defaults to False. + text_content (bool): Whether to return only the text content + (True) or the full HTML (False). Defaults to True. + async_mode (bool): Whether to run the function asynchronously + (True) or synchronously (False). Defaults to False. Returns: object: The result of the loading operation. @@ -47,7 +51,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str], text_cont try: from browserbase import Browserbase except ImportError: - raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.") + raise ImportError(f"""The browserbase module is not installed. + Please install it using `pip install browserbase`.""") browserbase = Browserbase(api_key=api_key, project_id=project_id) diff --git a/scrapegraphai/docloaders/scrape_do.py b/scrapegraphai/docloaders/scrape_do.py index cd9086c3..467ea0a1 100644 --- a/scrapegraphai/docloaders/scrape_do.py +++ b/scrapegraphai/docloaders/scrape_do.py @@ -24,17 +24,14 @@ def scrape_do_fetch(token, target_url, use_proxy=False, geoCode=None, super_prox """ encoded_url = urllib.parse.quote(target_url) if use_proxy: - # Create proxy mode URL - proxyModeUrl = f"http://{token}:@proxy.scrape.do:8080" + proxy_mode_url = f"http://{token}:@proxy.scrape.do:8080" proxies = { - "http": proxyModeUrl, - "https": proxyModeUrl, + "http": proxy_mode_url, + "https": proxy_mode_url, } - # Add optional geoCode and super proxy parameters if provided params = {"geoCode": geoCode, "super": str(super_proxy).lower()} if geoCode else {} response = requests.get(target_url, proxies=proxies, verify=False, params=params) else: - # API Mode URL url = f"http://api.scrape.do?token={token}&url={encoded_url}" response = requests.get(url) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index ebe914fb..efd6bd7e 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -5,7 +5,6 @@ from .abstract_graph import AbstractGraph from .base_graph import BaseGraph from .smart_scraper_graph import SmartScraperGraph -from .deep_scraper_graph import DeepScraperGraph from .speech_graph import SpeechGraph from .search_graph import SearchGraph from .script_creator_graph import ScriptCreatorGraph @@ -26,4 +25,4 @@ from .search_link_graph import SearchLinkGraph from .screenshot_scraper_graph import ScreenshotScraperGraph from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph -from .code_generator_graph import CodeGeneratorGraph \ No newline at end of file +from .code_generator_graph import CodeGeneratorGraph diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index b546460f..99c13886 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -128,7 +128,7 @@ def _create_llm(self, llm_config: dict) -> object: if requests_per_second is not None: with warnings.catch_warnings(): warnings.simplefilter("ignore") - llm_params["rate_limiter"] = InMemoryRateLimiter(requests_per_second=requests_per_second) + llm_params["rate_limiter"] = InMemoryRateLimiter(requests_per_second=requests_per_second) if max_retries is not None: llm_params["max_retries"] = max_retries diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 6d160e37..05f9773c 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -45,7 +45,8 @@ class 
BaseGraph: ... ) """ - def __init__(self, nodes: list, edges: list, entry_point: str, use_burr: bool = False, burr_config: dict = None, graph_name: str = "Custom"): + def __init__(self, nodes: list, edges: list, entry_point: str, + use_burr: bool = False, burr_config: dict = None, graph_name: str = "Custom"): self.nodes = nodes self.raw_edges = edges self.edges = self._create_edges({e for e in edges}) diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py index 9786dc4f..c0c0f52b 100644 --- a/scrapegraphai/graphs/code_generator_graph.py +++ b/scrapegraphai/graphs/code_generator_graph.py @@ -51,7 +51,7 @@ class CodeGeneratorGraph(AbstractGraph): """ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): - + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -63,7 +63,7 @@ def _create_graph(self) -> BaseGraph: Returns: BaseGraph: A graph instance representing the web scraping workflow. """ - + if self.schema is None: raise KeyError("The schema is required for CodeGeneratorGraph") diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index eb34383e..d7854de9 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -1,7 +1,6 @@ """ Module for creating the smart scraper """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index a2eb13db..ee126e19 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -1,7 +1,6 @@ """ CSVScraperMultiGraph Module """ - from copy import deepcopy from typing import List, Optional from pydantic import BaseModel diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py deleted file mode 100644 index 404fed9f..00000000 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -DeepScraperGraph Module -""" -from typing import Optional -from pydantic import BaseModel -from .base_graph import BaseGraph -from .abstract_graph import AbstractGraph -from ..nodes import ( - FetchNode, - SearchLinkNode, - ParseNode, - GenerateAnswerNode, - GraphIteratorNode, - MergeAnswersNode -) - -class DeepScraperGraph(AbstractGraph): - """ - [WIP] - - DeepScraper is a scraping pipeline that automates the process of - extracting information from web pages using a natural language model - to interpret and answer prompts. - - Unlike SmartScraper, DeepScraper can navigate to the links within, - the input webpage to fuflfil the task within the prompt. - - Attributes: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - schema (BaseModel): The schema for the graph output. - llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, - configured for generating embeddings. - verbose (bool): A flag indicating whether to show print statements during execution. - headless (bool): A flag indicating whether to run the graph in headless mode. - - Args: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. 
- schema (BaseModel): The schema for the graph output. - - Example: - >>> deep_scraper = DeepScraperGraph( - ... "List me all the job titles and detailed job description.", - ... "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} - ... ) - >>> result = deep_scraper.run() - ) - """ - - def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): - - super().__init__(prompt, config, source, schema) - - self.input_key = "url" if source.startswith("http") else "local_dir" - - def _create_repeated_graph(self) -> BaseGraph: - """ - Creates the graph that can be repeatedly executed to conduct search on - hyperlinks within the webpage. - - Returns: - BaseGraph: A graph instance representing the web scraping workflow. - """ - fetch_node = FetchNode( - input="url | local_dir", - output=["doc"] - ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token, - "llm_model": self.llm_model - } - ) - - generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "additional_info": self.config.get("additional_info"), - "schema": self.schema - } - ) - - search_node = SearchLinkNode( - input="user_prompt & relevant_chunks", - output=["relevant_links"], - node_config={ - "llm_model": self.llm_model, - } - ) - - graph_iterator_node = GraphIteratorNode( - input="user_prompt & relevant_links", - output=["results"], - node_config={ - "graph_instance": None, - "batchsize": 1 - } - ) - - merge_answers_node = MergeAnswersNode( - input="user_prompt & results", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "schema": self.schema - } - ) - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - search_node, - graph_iterator_node, - merge_answers_node - ], - edges=[ - (fetch_node, parse_node), - (search_node, graph_iterator_node), - (graph_iterator_node, merge_answers_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - - - - def _create_graph(self) -> BaseGraph: - """ - Creates the graph of nodes representing the workflow for web scraping - n-levels deep. - - Returns: - BaseGraph: A graph instance representing the web scraping workflow. - """ - - base_graph = self._create_repeated_graph() - graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", - base_graph.nodes))[0] - graph_iterator_node.node_config["graph_instance"] = self - return base_graph - - def run(self) -> str: - """ - Executes the scraping process and returns the answer to the prompt. - Returns: - str: The answer to the prompt. 
- """ - - inputs = {"user_prompt": self.prompt, self.input_key: self.source} - self.final_state, self.execution_info = self.graph.execute(inputs) - - return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 8e7c56e6..6e5434f0 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -1,7 +1,6 @@ """ JSONScraperMultiGraph Module """ - from copy import deepcopy from typing import List, Optional from pydantic import BaseModel @@ -46,9 +45,7 @@ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) - self.copy_config = safe_deepcopy(config) - self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) diff --git a/scrapegraphai/graphs/markdown_scraper_multi_graph.py b/scrapegraphai/graphs/markdown_scraper_multi_graph.py index fb7d6863..1857f872 100644 --- a/scrapegraphai/graphs/markdown_scraper_multi_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_multi_graph.py @@ -1,7 +1,7 @@ """ MDScraperMultiGraph Module """ -from copy import copy, deepcopy +from copy import deepcopy from typing import List, Optional from pydantic import BaseModel from .base_graph import BaseGraph diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 500d9461..55af447f 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -1,7 +1,6 @@ """ OmniScraperGraph Module """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph diff --git a/scrapegraphai/graphs/omni_search_graph.py b/scrapegraphai/graphs/omni_search_graph.py index 4cabd18d..41d35ab7 100644 --- a/scrapegraphai/graphs/omni_search_graph.py +++ b/scrapegraphai/graphs/omni_search_graph.py @@ -1,7 +1,6 @@ """ OmniSearchGraph Module """ - from copy import deepcopy from typing import Optional from pydantic import BaseModel @@ -15,7 +14,6 @@ ) from ..utils.copy import safe_deepcopy - class OmniSearchGraph(AbstractGraph): """ OmniSearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. 
diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 341243a4..65ede542 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -1,4 +1,3 @@ - """ PDFScraperGraph Module """ diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index 3b19516b..e0c56341 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -1,7 +1,6 @@ """ PdfScraperMultiGraph Module """ - from copy import deepcopy from typing import List, Optional from pydantic import BaseModel diff --git a/scrapegraphai/graphs/screenshot_scraper_graph.py b/scrapegraphai/graphs/screenshot_scraper_graph.py index f3ce608d..174e245a 100644 --- a/scrapegraphai/graphs/screenshot_scraper_graph.py +++ b/scrapegraphai/graphs/screenshot_scraper_graph.py @@ -47,6 +47,7 @@ def _create_graph(self) -> BaseGraph: "link": self.source } ) + generate_answer_from_image_node = GenerateAnswerFromImageNode( input="screenshots", output=["answer"], diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index 732fb3cf..f6a884a1 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -68,6 +68,7 @@ def _create_graph(self) -> BaseGraph: "script_creator": True } ) + parse_node = ParseNode( input="doc", output=["parsed_doc"], @@ -76,6 +77,7 @@ def _create_graph(self) -> BaseGraph: "llm_model": self.llm_model } ) + generate_scraper_node = GenerateScraperNode( input="user_prompt & (parsed_doc)", output=["answer"], diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index 864485fb..de1ab6f7 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -1,7 +1,7 @@ """ ScriptCreatorMultiGraph Module """ - +from copy import deepcopy from typing import List, Optional from pydantic import BaseModel from .base_graph import BaseGraph @@ -46,7 +46,7 @@ def __init__(self, prompt: str, source: List[str], self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) - + self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) def _create_graph(self) -> BaseGraph: @@ -56,19 +56,14 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - script_generator_instance = ScriptCreatorGraph( - prompt="", - source="", - config=self.copy_config, - schema=self.schema - ) - graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["scripts"], node_config={ - "graph_instance": script_generator_instance, - } + "graph_instance": ScriptCreatorGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema ) merge_scripts_node = MergeGeneratedScriptsNode( diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index aa423046..8086caa6 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -1,15 +1,12 @@ """ SearchGraph Module """ - from copy import deepcopy from typing import Optional, List from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .smart_scraper_graph import SmartScraperGraph - from ..nodes import ( SearchInternetNode, GraphIteratorNode, @@ -78,6 +75,7 @@ def _create_graph(self) -> BaseGraph: "search_engine": self.copy_config.get("search_engine") } ) + graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["results"], diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index c44d707a..9df04871 100644 --- a/scrapegraphai/graphs/search_link_graph.py +++ b/scrapegraphai/graphs/search_link_graph.py @@ -6,9 +6,11 @@ from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph -from ..nodes import ( FetchNode, ParseNode, SearchLinkNode ) +from ..nodes import (FetchNode, + SearchLinkNode, + SearchLinksWithContext) -class SearchLinkGraph(AbstractGraph): +class SearchLinkGraph(AbstractGraph): """ SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model @@ -30,13 +32,7 @@ class SearchLinkGraph(AbstractGraph): config (dict): Configuration parameters for the graph. schema (BaseModel, optional): The schema for the graph output. Defaults to None. - Example: - >>> smart_scraper = SearchLinkGraph( - ... "List me all the attractions in Chioggia.", - ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} - ... ) - >>> result = smart_scraper.run() + """ def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None): @@ -51,45 +47,41 @@ def _create_graph(self) -> BaseGraph: Returns: BaseGraph: A graph instance representing the web scraping workflow. 
""" - fetch_node = FetchNode( - input="url| local_dir", - output=["doc"], - node_config={ - "llm_model": self.llm_model, - "force": self.config.get("force", False), - "cut": self.config.get("cut", True), - "loader_kwargs": self.config.get("loader_kwargs", {}), - } - ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token, - "llm_model": self.llm_model - } - ) - search_link_node = SearchLinkNode( - input="doc", - output=["parsed_doc"], - node_config={ - "llm_model": self.llm_model, - "chunk_size": self.model_token, - "filter_links": self.config.get("filter_links", None), - "filter_config": self.config.get("filter_config", None) - } - ) + input="url| local_dir", + output=["doc"], + node_config={ + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "loader_kwargs": self.config.get("loader_kwargs", {}), + } + ) + + if self.config.get("llm_style") == (True, None): + search_link_node = SearchLinksWithContext( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token, + } + ) + else: + search_link_node = SearchLinkNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": self.model_token, + } + ) return BaseGraph( nodes=[ fetch_node, - parse_node, search_link_node ], edges=[ - (fetch_node, parse_node), - (parse_node, search_link_node) + (fetch_node, search_link_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 65f03a24..60407624 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -70,6 +70,14 @@ def _create_graph(self) -> BaseGraph: "scrape_do": self.config.get("scrape_do") } ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py index 2097e1ca..ce879317 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -1,7 +1,6 @@ """ SmartScraperMultiGraph Module """ - from copy import deepcopy from typing import List, Optional from pydantic import BaseModel @@ -14,7 +13,6 @@ ) from ..utils.copy import safe_deepcopy - class SmartScraperMultiConcatGraph(AbstractGraph): """ SmartScraperMultiGraph is a scraping pipeline that scrapes a @@ -43,9 +41,8 @@ class SmartScraperMultiConcatGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index be81fab4..5dff3277 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -1,7 +1,6 @@ """ SmartScraperMultiGraph Module """ - from copy import deepcopy from typing import List, Optional from pydantic import BaseModel @@ -46,9 +45,7 @@ def __init__(self, prompt: str, source: List[str], config: dict, 
schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) - self.copy_config = safe_deepcopy(config) - self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) @@ -61,13 +58,6 @@ def _create_graph(self) -> BaseGraph: BaseGraph: A graph instance representing the web scraping and searching workflow. """ - # smart_scraper_instance = SmartScraperGraph( - # prompt="", - # source="", - # config=self.copy_config, - # schema=self.copy_schema - # ) - graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", output=["results"], diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 6065bcf4..d491d4bc 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -61,6 +61,7 @@ def _create_graph(self) -> BaseGraph: input="url | local_dir", output=["doc"] ) + parse_node = ParseNode( input="doc", output=["parsed_doc"], @@ -69,6 +70,7 @@ def _create_graph(self) -> BaseGraph: "llm_model": self.llm_model } ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -78,6 +80,7 @@ def _create_graph(self) -> BaseGraph: "schema": self.schema } ) + text_to_speech_node = TextToSpeechNode( input="answer", output=["audio"], diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index ec75aee9..502ea99f 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -1,7 +1,6 @@ """ XMLScraperGraph Module """ - from typing import Optional from pydantic import BaseModel from .base_graph import BaseGraph diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index 3ff79fad..10887c6b 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -1,7 +1,6 @@ """ XMLScraperMultiGraph Module """ - from copy import deepcopy from typing import List, Optional from pydantic import BaseModel @@ -46,9 +45,7 @@ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.copy_config = safe_deepcopy(config) - self.copy_schema = deepcopy(schema) - super().__init__(prompt, config, source, schema) def _create_graph(self) -> BaseGraph: @@ -58,14 +55,6 @@ def _create_graph(self) -> BaseGraph: Returns: BaseGraph: A graph instance representing the web scraping and searching workflow. 
""" - - # smart_scraper_instance = XMLScraperGraph( - # prompt="", - # source="", - # config=self.copy_config, - # schema=self.copy_schema - # ) - graph_iterator_node = GraphIteratorNode( input="user_prompt & jsons", output=["results"], diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 2da600ee..113d1636 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -76,6 +76,8 @@ "llama3.1:8b": 128000, "llama3.1:70b": 128000, "lama3.1:405b": 128000, + "llama3.2": 128000, + "llama3.2:1b": 128000, "scrapegraph": 8192, "mistral": 8192, "grok-1": 8192, diff --git a/scrapegraphai/integrations/burr_bridge.py b/scrapegraphai/integrations/burr_bridge.py index 76cce914..959634bb 100644 --- a/scrapegraphai/integrations/burr_bridge.py +++ b/scrapegraphai/integrations/burr_bridge.py @@ -15,7 +15,8 @@ from burr.core import Application, ApplicationBuilder, State, Action, default, ApplicationContext from burr.lifecycle import PostRunStepHook, PreRunStepHook except ImportError: - raise ImportError("burr package is not installed. Please install it with 'pip install scrapegraphai[burr]'") + raise ImportError("""burr package is not installed. + Please install it with 'pip install scrapegraphai[burr]'""") class PrintLnHook(PostRunStepHook, PreRunStepHook): @@ -32,7 +33,8 @@ def post_run_step(self, *, state: "State", action: "Action", **future_kwargs: An class BurrNodeBridge(Action): """Bridge class to convert a base graph node to a Burr action. - This is nice because we can dynamically declare the inputs/outputs (and not rely on function-parsing). + This is nice because we can dynamically declare + the inputs/outputs (and not rely on function-parsing). """ def __init__(self, node): @@ -63,7 +65,8 @@ def get_source(self) -> str: def parse_boolean_expression(expression: str) -> List[str]: """ - Parse a boolean expression to extract the keys used in the expression, without boolean operators. + Parse a boolean expression to extract the keys + used in the expression, without boolean operators. Args: expression (str): The boolean expression to parse. @@ -136,10 +139,9 @@ def _initialize_burr_app(self, initial_state: Dict[str, Any] = None) -> Applicat if application_context is not None: builder = ( builder - # if we're using a tracker, we want to copy it/pass in .with_tracker( application_context.tracker.copy() if application_context.tracker is not None else None - ) # remember to do `copy()` here! + ) .with_spawning_parent( application_context.app_id, application_context.sequence_id, @@ -157,7 +159,8 @@ def _create_actions(self) -> Dict[str, Any]: Create Burr actions from the base graph nodes. Returns: - dict: A dictionary of Burr actions with the node name as keys and the action functions as values. + dict: A dictionary of Burr actions with the node name + as keys and the action functions as values. """ actions = {} diff --git a/scrapegraphai/integrations/indexify_node.py b/scrapegraphai/integrations/indexify_node.py index e12adc69..cf15cd0e 100644 --- a/scrapegraphai/integrations/indexify_node.py +++ b/scrapegraphai/integrations/indexify_node.py @@ -3,16 +3,9 @@ """ from typing import List, Optional - from ..utils.logging import get_logger from ..nodes.base_node import BaseNode -# try: -# import indexify -# except ImportError: -# raise ImportError("indexify package is not installed. 
Please install it with 'pip install scrapegraphai[indexify]'") - - class IndexifyNode(BaseNode): """ A node responsible for indexing the content present in the state. @@ -61,7 +54,7 @@ def execute(self, state: dict) -> dict: # Interpret input keys based on the provided input expression # input_keys length matches the min_input_len parameter in the __init__ method # e.g. "answer & parsed_doc" or "answer | img_urls" - + input_keys = self.get_input_keys(state) # Fetching data from the state based on the input keys diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index 7ed99808..ec16c48e 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -26,4 +26,5 @@ from .prompt_refiner_node import PromptRefinerNode from .html_analyzer_node import HtmlAnalyzerNode from .generate_code_node import GenerateCodeNode +from .search_node_with_context import SearchLinksWithContext from .reasoning_node import ReasoningNode diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index 90dbea51..8ba55452 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -1,13 +1,11 @@ """ BaseNode Module """ - import re from abc import ABC, abstractmethod from typing import List, Optional from ..utils import get_logger - class BaseNode(ABC): """ An abstract base class for nodes in a graph-based workflow, @@ -194,18 +192,14 @@ def _parse_input_keys(self, state: dict, expression: str) -> List[str]: "Invalid operator placement: operators cannot be adjacent." ) - # Check for missing or balanced parentheses if open_parentheses != close_parentheses: raise ValueError("Missing or unbalanced parentheses in expression.") - # Helper function to evaluate an expression without parentheses def evaluate_simple_expression(exp: str) -> List[str]: """Evaluate an expression without parentheses.""" - # Split the expression by the OR operator and process each segment for or_segment in exp.split("|"): - # Check if all elements in an AND segment are in state and_segment = or_segment.split("&") if all(elem.strip() in state for elem in and_segment): return [ @@ -213,7 +207,6 @@ def evaluate_simple_expression(exp: str) -> List[str]: ] return [] - # Helper function to evaluate expressions with parentheses def evaluate_expression(expression: str) -> List[str]: """Evaluate an expression with parentheses.""" @@ -222,10 +215,8 @@ def evaluate_expression(expression: str) -> List[str]: end = expression.find(")", start) sub_exp = expression[start + 1 : end] - # Replace the evaluated part with a placeholder and then evaluate it sub_result = evaluate_simple_expression(sub_exp) - # For simplicity in handling, join sub-results with OR to reprocess them later expression = ( expression[:start] + "|".join(sub_result) + expression[end + 1 :] ) @@ -238,7 +229,6 @@ def evaluate_expression(expression: str) -> List[str]: Expression was {expression}. 
State contains keys: {', '.join(state.keys())}""") - # Remove redundant state keys from the result, without changing their order final_result = [] for key in result: if key not in final_result: diff --git a/scrapegraphai/nodes/concat_answers_node.py b/scrapegraphai/nodes/concat_answers_node.py index 5af81702..438218b5 100644 --- a/scrapegraphai/nodes/concat_answers_node.py +++ b/scrapegraphai/nodes/concat_answers_node.py @@ -1,7 +1,6 @@ """ ConcatAnswersNode Module """ - from typing import List, Optional from ..utils.logging import get_logger from .base_node import BaseNode @@ -57,19 +56,15 @@ def execute(self, state: dict) -> dict: self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] answers = input_data[0] - + if len(answers) > 1: - # merge the answers in one string answer = self._merge_dict(answers) - # Update the state with the generated answer state.update({self.output[0]: answer}) else: diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index aa72a4b1..4aabce5d 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -32,7 +32,6 @@ def __init__(self): """ Initializes an empty ConditionalNode. """ - #super().__init__(node_name, "node", input, output, 2, node_config) pass diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 053a655b..a548e05b 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -219,7 +219,9 @@ def handle_local_source(self, state, source): parsed_content = source - if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator: + if (isinstance(self.llm_model, ChatOpenAI) or \ + isinstance(self.llm_model, AzureChatOpenAI)) \ + and not self.script_creator or self.force and not self.script_creator: parsed_content = convert_to_md(source) else: parsed_content = source diff --git a/scrapegraphai/nodes/fetch_screen_node.py b/scrapegraphai/nodes/fetch_screen_node.py index 0bb71c37..1b605b86 100644 --- a/scrapegraphai/nodes/fetch_screen_node.py +++ b/scrapegraphai/nodes/fetch_screen_node.py @@ -16,7 +16,7 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "FetchScreenNode", + node_name: str = "FetchScreen", ): super().__init__(node_name, "node", input, output, 2, node_config) self.url = node_config.get("link") diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 85593cfa..0419d891 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -1,7 +1,6 @@ """ Module for generating the answer node """ - from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -82,16 +81,13 @@ def execute(self, state): self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] user_prompt = input_data[0] doc = input_data[1] - # Initialize the output parser if 
self.node_config.get("schema", None) is not None: if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 2824a573..403240dd 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -81,7 +81,6 @@ def execute(self, state: dict) -> dict: doc = input_data[1] imag_desc = input_data[2] - # Initialize the output parser if self.node_config.get("schema", None) is not None: if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index bcb7ea74..cc72aaf4 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -2,19 +2,20 @@ GenerateCodeNode Module """ from typing import Any, Dict, List, Optional -from langchain.prompts import PromptTemplate -from langchain.output_parsers import ResponseSchema, StructuredOutputParser -from langchain_core.output_parsers import StrOutputParser -from langchain_community.chat_models import ChatOllama import ast import sys from io import StringIO -from bs4 import BeautifulSoup import re import json -from jsonschema import validate, ValidationError from pydantic import ValidationError -from .base_node import BaseNode +from langchain.prompts import PromptTemplate +from langchain.output_parsers import ResponseSchema, StructuredOutputParser +from langchain_core.output_parsers import StrOutputParser +from langchain_community.chat_models import ChatOllama +from bs4 import BeautifulSoup +from ..prompts import ( + TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON +) from ..utils import (transform_schema, extract_code, syntax_focused_analysis, syntax_focused_code_generation, @@ -22,13 +23,14 @@ validation_focused_analysis, validation_focused_code_generation, semantic_focused_analysis, semantic_focused_code_generation, are_content_equal) -from ..prompts import ( - TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON -) +from .base_node import BaseNode +from jsonschema import validate, ValidationError + class GenerateCodeNode(BaseNode): """ - A node that generates Python code for a function that extracts data from HTML based on a output schema. + A node that generates Python code for a function that extracts data + from HTML based on a output schema. Attributes: llm_model: An instance of a language model client, configured for generating answers. @@ -69,7 +71,7 @@ def __init__( ) self.additional_info = node_config.get("additional_info") - + self.max_iterations = node_config.get("max_iterations", { "overall": 10, "syntax": 3, @@ -77,7 +79,7 @@ def __init__( "validation": 3, "semantic": 3 }) - + self.output_schema = node_config.get("schema") def execute(self, state: dict) -> dict: @@ -94,25 +96,26 @@ def execute(self, state: dict) -> dict: Raises: KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. - RuntimeError: If the maximum number of iterations is reached without obtaining the desired code. + RuntimeError: If the maximum number of iterations is + reached without obtaining the desired code. 
""" - + self.logger.info(f"--- Executing {self.node_name} Node ---") input_keys = self.get_input_keys(state) - + input_data = [state[key] for key in input_keys] - + user_prompt = input_data[0] refined_prompt = input_data[1] html_info = input_data[2] reduced_html = input_data[3] - answer = input_data[4] - + answer = input_data[4] + self.raw_html = state['original_html'][0].page_content - + simplefied_schema = str(transform_schema(self.output_schema.schema())) - + reasoning_state = { "user_input": user_prompt, "json_schema": simplefied_schema, @@ -130,89 +133,103 @@ def execute(self, state: dict) -> dict: }, "iteration": 0 } - - + final_state = self.overall_reasoning_loop(reasoning_state) - + state.update({self.output[0]: final_state["generated_code"]}) return state - + def overall_reasoning_loop(self, state: dict) -> dict: + """ + overrall_reasoning_loop + """ self.logger.info(f"--- (Generating Code) ---") state["generated_code"] = self.generate_initial_code(state) state["generated_code"] = extract_code(state["generated_code"]) - + while state["iteration"] < self.max_iterations["overall"]: state["iteration"] += 1 if self.verbose: self.logger.info(f"--- Iteration {state['iteration']} ---") - + self.logger.info(f"--- (Checking Code Syntax) ---") state = self.syntax_reasoning_loop(state) if state["errors"]["syntax"]: continue - + self.logger.info(f"--- (Executing the Generated Code) ---") state = self.execution_reasoning_loop(state) if state["errors"]["execution"]: continue - + self.logger.info(f"--- (Validate the Code Output Schema) ---") state = self.validation_reasoning_loop(state) if state["errors"]["validation"]: continue - + self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---") state = self.semantic_comparison_loop(state) if state["errors"]["semantic"]: - continue + continue break - - if state["iteration"] == self.max_iterations["overall"] and (state["errors"]["syntax"] or state["errors"]["execution"] or state["errors"]["validation"] or state["errors"]["semantic"]): + + if state["iteration"] == self.max_iterations["overall"] and \ + (state["errors"]["syntax"] or state["errors"]["execution"] \ + or state["errors"]["validation"] or state["errors"]["semantic"]): raise RuntimeError("Max iterations reached without obtaining the desired code.") - + self.logger.info(f"--- (Code Generated Correctly) ---") - + return state - + def syntax_reasoning_loop(self, state: dict) -> dict: + """ + syntax reasoning loop + """ for _ in range(self.max_iterations["syntax"]): syntax_valid, syntax_message = self.syntax_check(state["generated_code"]) if syntax_valid: state["errors"]["syntax"] = [] return state - + state["errors"]["syntax"] = [syntax_message] self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---") analysis = syntax_focused_analysis(state, self.llm_model) - self.logger.info(f"--- (Regenerating Code to fix the Error) ---") - state["generated_code"] = syntax_focused_code_generation(state, analysis, self.llm_model) + self.logger.info(f"""--- (Regenerating Code + to fix the Error) ---""") + state["generated_code"] = syntax_focused_code_generation(state, + analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state - + def execution_reasoning_loop(self, state: dict) -> dict: + """ + execution of the reasoning loop + """ for _ in range(self.max_iterations["execution"]): execution_success, execution_result = self.create_sandbox_and_execute(state["generated_code"]) if execution_success: state["execution_result"] = 
execution_result state["errors"]["execution"] = [] return state - + state["errors"]["execution"] = [execution_result] self.logger.info(f"--- (Code Execution Error: {execution_result}) ---") analysis = execution_focused_analysis(state, self.llm_model) self.logger.info(f"--- (Regenerating Code to fix the Error) ---") - state["generated_code"] = execution_focused_code_generation(state, analysis, self.llm_model) + state["generated_code"] = execution_focused_code_generation(state, + analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state - + def validation_reasoning_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["validation"]): - validation, errors = self.validate_dict(state["execution_result"], self.output_schema.schema()) + validation, errors = self.validate_dict(state["execution_result"], + self.output_schema.schema()) if validation: state["errors"]["validation"] = [] return state - + state["errors"]["validation"] = errors self.logger.info(f"--- (Code Output not compliant to the deisred Output Schema) ---") analysis = validation_focused_analysis(state, self.llm_model) @@ -220,14 +237,15 @@ def validation_reasoning_loop(self, state: dict) -> dict: state["generated_code"] = validation_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state - + def semantic_comparison_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["semantic"]): - comparison_result = self.semantic_comparison(state["execution_result"], state["reference_answer"]) + comparison_result = self.semantic_comparison(state["execution_result"], + state["reference_answer"]) if comparison_result["are_semantically_equivalent"]: state["errors"]["semantic"] = [] return state - + state["errors"]["semantic"] = comparison_result["differences"] self.logger.info(f"--- (The informations exctrcated are not the all ones requested) ---") analysis = semantic_focused_analysis(state, comparison_result, self.llm_model) @@ -235,8 +253,11 @@ def semantic_comparison_loop(self, state: dict) -> dict: state["generated_code"] = semantic_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state - + def generate_initial_code(self, state: dict) -> str: + """ + function for generating the initial code + """ prompt = PromptTemplate( template=TEMPLATE_INIT_CODE_GENERATION, partial_variables={ @@ -252,22 +273,29 @@ def generate_initial_code(self, state: dict) -> str: chain = prompt | self.llm_model | output_parser generated_code = chain.invoke({}) return generated_code - + def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]: + """ + semtantic comparison formula + """ reference_result_dict = self.output_schema(**reference_result).dict() - - # Check if generated result and reference result are actually equal if are_content_equal(generated_result, reference_result_dict): return { "are_semantically_equivalent": True, "differences": [], "explanation": "The generated result and reference result are exactly equal." 
} - + response_schemas = [ - ResponseSchema(name="are_semantically_equivalent", description="Boolean indicating if the results are semantically equivalent"), - ResponseSchema(name="differences", description="List of semantic differences between the results, if any"), - ResponseSchema(name="explanation", description="Detailed explanation of the comparison and reasoning") + ResponseSchema(name="are_semantically_equivalent", + description="""Boolean indicating if the + results are semantically equivalent"""), + ResponseSchema(name="differences", + description="""List of semantic differences + between the results, if any"""), + ResponseSchema(name="explanation", + description="""Detailed explanation of the + comparison and reasoning""") ] output_parser = StructuredOutputParser.from_response_schemas(response_schemas) @@ -282,8 +310,11 @@ def semantic_comparison(self, generated_result: Any, reference_result: Any) -> D "generated_result": json.dumps(generated_result, indent=2), "reference_result": json.dumps(reference_result_dict, indent=2) }) - + def syntax_check(self, code): + """ + syntax checker + """ try: ast.parse(code) return True, "Syntax is correct." @@ -291,36 +322,40 @@ def syntax_check(self, code): return False, f"Syntax error: {str(e)}" def create_sandbox_and_execute(self, function_code): - # Create a sandbox environment + """ + Create a sandbox environment + """ sandbox_globals = { 'BeautifulSoup': BeautifulSoup, 're': re, '__builtins__': __builtins__, } - + old_stdout = sys.stdout sys.stdout = StringIO() - + try: exec(function_code, sandbox_globals) - + extract_data = sandbox_globals.get('extract_data') - + if not extract_data: raise NameError("Function 'extract_data' not found in the generated code.") - - result = extract_data(self.raw_html) - + + result = extract_data(self.raw_html) return True, result except Exception as e: return False, f"Error during execution: {str(e)}" finally: sys.stdout = old_stdout - + def validate_dict(self, data: dict, schema): + """ + validate_dict method + """ try: validate(instance=data, schema=schema) return True, None except ValidationError as e: errors = e.errors() - return False, errors \ No newline at end of file + return False, errors diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index f7fd944f..25e704ad 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -4,8 +4,8 @@ import asyncio from typing import List, Optional from tqdm.asyncio import tqdm -from .base_node import BaseNode from pydantic import BaseModel +from .base_node import BaseNode DEFAULT_BATCHSIZE = 16 diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 13113aa0..00c71e93 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -62,10 +62,9 @@ def execute(self, state: dict) -> dict: elif len(urls) == 0: return state.update({self.output[0]: []}) - # Skip the image-to-text conversion if self.max_images < 1: return state.update({self.output[0]: []}) - + img_desc = [] for url in urls[: self.max_images]: try: diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 1f919926..fd2f3810 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -33,7 +33,7 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "Parse", + node_name: str = "ParseNode", ): 
super().__init__(node_name, "node", input, output, 1, node_config) @@ -88,7 +88,7 @@ def execute(self, state: dict) -> dict: link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) chunk_size = self.chunk_size - chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) + chunk_size = min(chunk_size - 500, int(chunk_size * 0.75)) if isinstance(docs_transformed, Document): chunks = split_text_into_chunks(text=docs_transformed.page_content, @@ -118,7 +118,7 @@ def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: """ if not self.parse_urls: return [], [] - + image_extensions = default_filters.filter_dict["img_exts"] image_extension_seq = '|'.join(image_extensions).replace('.','') url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') @@ -130,12 +130,12 @@ def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: all_urls = [url for url in all_urls if url.startswith("http")] else: all_urls = [urljoin(source, url) for url in all_urls] - + images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)] links = [url for url in all_urls if url not in images] return links, images - + def _clean_urls(self, urls: List[str]) -> List[str]: """ Cleans the URLs extracted from the text. @@ -150,7 +150,7 @@ def _clean_urls(self, urls: List[str]) -> List[str]: for url in urls: url = re.sub(r'.*?\]\(', '', url) url = url.rstrip(').') - + cleaned_urls.append(url) - - return cleaned_urls \ No newline at end of file + + return cleaned_urls diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py index 7cc53020..66c960ff 100644 --- a/scrapegraphai/nodes/prompt_refiner_node.py +++ b/scrapegraphai/nodes/prompt_refiner_node.py @@ -56,7 +56,7 @@ def __init__( ) self.additional_info = node_config.get("additional_info") - + self.output_schema = node_config.get("schema") def execute(self, state: dict) -> dict: @@ -80,7 +80,7 @@ def execute(self, state: dict) -> dict: user_prompt = state['user_prompt'] self.simplefied_schema = transform_schema(self.output_schema.schema()) - + if self.additional_info is not None: prompt = PromptTemplate( template=TEMPLATE_REFINER_WITH_CONTEXT, diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 974fa772..1174beee 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -13,8 +13,8 @@ from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS from langchain_community.chat_models import ChatOllama -from langchain_aws import BedrockEmbeddings, ChatBedrock from langchain_community.embeddings import OllamaEmbeddings +from langchain_aws import BedrockEmbeddings, ChatBedrock from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI from ..utils.logging import get_logger @@ -22,7 +22,8 @@ from ..helpers import models_tokens from ..models import DeepSeek -optional_modules = {"langchain_anthropic", "langchain_fireworks", "langchain_groq", "langchain_google_vertexai"} +optional_modules = {"langchain_anthropic", "langchain_fireworks", + "langchain_groq", "langchain_google_vertexai"} class RAGNode(BaseNode): """ @@ -60,96 +61,8 @@ def __init__( self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - """ - Executes the node's logic to 
implement RAG (Retrieval-Augmented Generation). - The method updates the state with relevant chunks of the document. - - Args: - state (dict): The current state of the graph. The input keys will be used to fetch the - correct data from the state. - - Returns: - dict: The updated state with the output key containing the relevant chunks of the document. - - Raises: - KeyError: If the input keys are not found in the state, indicating that the - necessary information for compressing the content is missing. - """ - - self.logger.info(f"--- Executing {self.node_name} Node ---") - - input_keys = self.get_input_keys(state) - - input_data = [state[key] for key in input_keys] - - user_prompt = input_data[0] - doc = input_data[1] - - chunked_docs = [] - - for i, chunk in enumerate(doc): - doc = Document( - page_content=chunk, - metadata={ - "chunk": i + 1, - }, - ) - chunked_docs.append(doc) - - self.logger.info("--- (updated chunks metadata) ---") - - if self.embedder_model is not None: - embeddings = self.embedder_model - elif 'embeddings' in self.node_config: - try: - embeddings = self._create_embedder(self.node_config['embedder_config']) - except Exception: - try: - embeddings = self._create_default_embedder() - self.embedder_model = embeddings - except ValueError: - embeddings = self.llm_model - self.embedder_model = self.llm_model - else: - embeddings = self.llm_model - self.embedder_model = self.llm_model - - folder_name = self.node_config.get("cache_path", "cache") - - if self.node_config.get("cache_path", False) and not os.path.exists(folder_name): - index = FAISS.from_documents(chunked_docs, embeddings) - os.makedirs(folder_name) - index.save_local(folder_name) - self.logger.info("--- (indexes saved to cache) ---") - - elif self.node_config.get("cache_path", False) and os.path.exists(folder_name): - index = FAISS.load_local(folder_path=folder_name, - embeddings=embeddings, - allow_dangerous_deserialization=True) - self.logger.info("--- (indexes loaded from cache) ---") - - else: - index = FAISS.from_documents(chunked_docs, embeddings) - - retriever = index.as_retriever() - - redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings) - # similarity_threshold could be set, now k=20 - relevant_filter = EmbeddingsFilter(embeddings=embeddings) - pipeline_compressor = DocumentCompressorPipeline( - transformers=[redundant_filter, relevant_filter] - ) - compression_retriever = ContextualCompressionRetriever( - base_compressor=pipeline_compressor, base_retriever=retriever - ) - - compressed_docs = compression_retriever.invoke(user_prompt) - - self.logger.info("--- (tokens compressed and vector stored) ---") - - state.update({self.output[0]: compressed_docs}) - return state - + # Execution logic + pass def _create_default_embedder(self, llm_config=None) -> object: """ @@ -176,9 +89,7 @@ def _create_default_embedder(self, llm_config=None) -> object: elif isinstance(self.llm_model, AzureChatOpenAI): return AzureOpenAIEmbeddings() elif isinstance(self.llm_model, ChatOllama): - # unwrap the kwargs from the model whihc is a dict params = self.llm_model._lc_kwargs - # remove streaming and temperature params.pop("streaming", None) params.pop("temperature", None) return OllamaEmbeddings(**params) @@ -186,17 +97,20 @@ def _create_default_embedder(self, llm_config=None) -> object: return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) elif all(key in sys.modules for key in optional_modules): if isinstance(self.llm_model, ChatFireworks): + from langchain_fireworks import 
FireworksEmbeddings return FireworksEmbeddings(model=self.llm_model.model_name) if isinstance(self.llm_model, ChatNVIDIA): + from langchain_nvidia import NVIDIAEmbeddings return NVIDIAEmbeddings(model=self.llm_model.model_name) if isinstance(self.llm_model, ChatHuggingFace): + from langchain_huggingface import HuggingFaceEmbeddings return HuggingFaceEmbeddings(model=self.llm_model.model) if isinstance(self.llm_model, ChatVertexAI): + from langchain_vertexai import VertexAIEmbeddings return VertexAIEmbeddings() else: raise ValueError("Embedding Model missing or not supported") - def _create_embedder(self, embedder_config: dict) -> object: """ Create an embedding model instance based on the configuration provided. @@ -240,20 +154,23 @@ def _create_embedder(self, embedder_config: dict) -> object: return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) if all(key in sys.modules for key in optional_modules): if "hugging_face" in embedder_params["model"]: + from langchain_huggingface import HuggingFaceEmbeddings embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: models_tokens["hugging_face"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return HuggingFaceEmbeddings(model=embedder_params["model"]) - if "fireworks" in embedder_params["model"]: + elif "fireworks" in embedder_params["model"]: + from langchain_fireworks import FireworksEmbeddings embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: models_tokens["fireworks"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return FireworksEmbeddings(model=embedder_params["model"]) - if "nvidia" in embedder_params["model"]: + elif "nvidia" in embedder_params["model"]: + from langchain_nvidia import NVIDIAEmbeddings embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: models_tokens["nvidia"][embedder_params["model"]] diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 034599ea..10907850 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -3,8 +3,8 @@ """ from typing import List, Optional import re -from tqdm import tqdm from urllib.parse import urlparse, parse_qs +from tqdm import tqdm from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel @@ -13,7 +13,6 @@ from ..prompts import TEMPLATE_RELEVANT_LINKS from ..helpers import default_filters - class SearchLinkNode(BaseNode): """ A node that can filter out the relevant links in the webpage content for the user prompt. 
@@ -36,12 +35,10 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "GenerateLinks", + node_name: str = "SearchLinks", ): super().__init__(node_name, "node", input, output, 1, node_config) - self.llm_model = node_config["llm_model"] - if node_config.get("filter_links", False) or "filter_config" in node_config: provided_filter_config = node_config.get("filter_config", {}) self.filter_config = {**default_filters.filter_dict, **provided_filter_config} @@ -74,10 +71,11 @@ def _is_language_url(self, url): parsed_url = urlparse(url) query_params = parse_qs(parsed_url.query) - return any(indicator in parsed_url.path.lower() or indicator in query_params for indicator in lang_indicators) + return any(indicator in parsed_url.path.lower() \ + or indicator in query_params for indicator in lang_indicators) def _is_potentially_irrelevant(self, url): if not self.filter_links: - return False + return False irrelevant_keywords = self.filter_config.get("irrelevant_keywords", []) return any(keyword in url.lower() for keyword in irrelevant_keywords) diff --git a/scrapegraphai/nodes/search_node_with_context.py b/scrapegraphai/nodes/search_node_with_context.py index 7343b64c..5a21010c 100644 --- a/scrapegraphai/nodes/search_node_with_context.py +++ b/scrapegraphai/nodes/search_node_with_context.py @@ -23,7 +23,7 @@ class SearchLinksWithContext(BaseNode): input (str): Boolean expression defining the input keys needed from the state. output (List[str]): List of output keys to be updated in the state. node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer". + node_name (str): The unique identifier name for the node, defaulting to "SearchLinksWithContext". """ def __init__( @@ -31,7 +31,7 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "GenerateAnswer", + node_name: str = "SearchLinksWithContext", ): super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] diff --git a/scrapegraphai/prompts/generate_answer_node_csv_prompts.py b/scrapegraphai/prompts/generate_answer_node_csv_prompts.py index 8df8bde3..b131890e 100644 --- a/scrapegraphai/prompts/generate_answer_node_csv_prompts.py +++ b/scrapegraphai/prompts/generate_answer_node_csv_prompts.py @@ -1,6 +1,7 @@ """ Generate answer csv schema """ + TEMPLATE_CHUKS_CSV = """ You are a scraper and you have just scraped the following content from a csv. diff --git a/scrapegraphai/prompts/generate_answer_node_pdf_prompts.py b/scrapegraphai/prompts/generate_answer_node_pdf_prompts.py index a3ea9bef..04472bfa 100644 --- a/scrapegraphai/prompts/generate_answer_node_pdf_prompts.py +++ b/scrapegraphai/prompts/generate_answer_node_pdf_prompts.py @@ -1,6 +1,7 @@ """ Generate anwer node pdf prompt """ + TEMPLATE_CHUNKS_PDF = """ You are a scraper and you have just scraped the following content from a PDF. 
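For reference, the generate_code_node.py changes earlier in this diff amount to a check-and-regenerate loop over the generated extraction code. The sketch below is a minimal standalone rendering of that pattern, not the ScrapeGraphAI API: regenerate is a placeholder for the LLM-driven correction helpers (syntax_focused_code_generation and related functions), and the function names and max_iter default are illustrative.

# Minimal sketch of the check-and-regenerate pattern (names are illustrative placeholders).
import ast
import re
import sys
from io import StringIO

from bs4 import BeautifulSoup
from jsonschema import ValidationError, validate


def syntax_ok(code: str):
    """Return (True, message) if the code parses, mirroring the node's syntax_check."""
    try:
        ast.parse(code)
        return True, "Syntax is correct."
    except SyntaxError as exc:
        return False, f"Syntax error: {exc}"


def run_sandboxed(code: str, html: str):
    """Exec the generated code in a restricted namespace and call its extract_data(html)."""
    sandbox = {"BeautifulSoup": BeautifulSoup, "re": re, "__builtins__": __builtins__}
    old_stdout, sys.stdout = sys.stdout, StringIO()
    try:
        exec(code, sandbox)
        extract_data = sandbox.get("extract_data")
        if extract_data is None:
            return False, "Function 'extract_data' not found in the generated code."
        return True, extract_data(html)
    except Exception as exc:  # surface any runtime failure to the outer loop
        return False, f"Error during execution: {exc}"
    finally:
        sys.stdout = old_stdout


def generate_loop(initial_code: str, html: str, schema: dict, regenerate, max_iter: int = 3):
    """Iterate syntax -> execution -> schema validation, regenerating the code on each failure."""
    code = initial_code
    for _ in range(max_iter):
        ok, message = syntax_ok(code)
        if not ok:
            code = regenerate(code, message)
            continue
        ok, result = run_sandboxed(code, html)
        if not ok:
            code = regenerate(code, result)
            continue
        try:
            validate(instance=result, schema=schema)
            return code, result
        except ValidationError as exc:
            code = regenerate(code, str(exc))
    raise RuntimeError("Max iterations reached without obtaining the desired code.")

Keeping syntax, execution, validation, and semantic checks in separate inner loops, as the node above does, lets each correction prompt be specialised to a single failure class.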
diff --git a/scrapegraphai/prompts/generate_code_node_prompts.py b/scrapegraphai/prompts/generate_code_node_prompts.py index eab92ee4..35d4f786 100644 --- a/scrapegraphai/prompts/generate_code_node_prompts.py +++ b/scrapegraphai/prompts/generate_code_node_prompts.py @@ -2,7 +2,6 @@ Generate code prompts helper """ - TEMPLATE_INIT_CODE_GENERATION = """ **Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema. diff --git a/scrapegraphai/prompts/html_analyzer_node_prompts.py b/scrapegraphai/prompts/html_analyzer_node_prompts.py index d7e6e342..97961047 100644 --- a/scrapegraphai/prompts/html_analyzer_node_prompts.py +++ b/scrapegraphai/prompts/html_analyzer_node_prompts.py @@ -2,7 +2,6 @@ HTML analysis prompts helper """ - TEMPLATE_HTML_ANALYSIS = """ Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string. diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index 61af900c..91073e28 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -14,6 +14,7 @@ or: export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false """ + import configparser import functools import importlib.metadata diff --git a/scrapegraphai/utils/cleanup_code.py b/scrapegraphai/utils/cleanup_code.py index 9bf91e62..ad3d437b 100644 --- a/scrapegraphai/utils/cleanup_code.py +++ b/scrapegraphai/utils/cleanup_code.py @@ -5,7 +5,7 @@ def extract_code(code: str) -> str: pattern = r'```(?:python)?\n(.*?)```' - + match = re.search(pattern, code, re.DOTALL) - - return match.group(1) if match else code \ No newline at end of file + + return match.group(1) if match else code diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 1521fe01..832f811e 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -57,22 +57,17 @@ def cleanup_html(html_content: str, base_url: str) -> str: def minify_html(html): - # Remove comments + """ + Minify HTML by removing comments and collapsing whitespace. + """ html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL) - - # Remove whitespace between tags + html = re.sub(r'>\s+<', '><', html) - - # Remove whitespace at the beginning and end of tags html = re.sub(r'\s+>', '>', html) html = re.sub(r'<\s+', '<', html) - - # Collapse multiple whitespace characters into a single space html = re.sub(r'\s+', ' ', html) - - # Remove spaces around equals signs in attributes html = re.sub(r'\s*=\s*', '=', html) - + return html.strip() def reduce_html(html, reduction): @@ -84,52 +79,45 @@ def reduce_html(html, reduction): reduction (int): The level of reduction to apply to the HTML content. 0: minification only, 1: minification and removing unnecessary tags and attributes, - 2: minification, removig unnecessary tags and attributes, simplifying text content, removing of the head tag + 2: minification, removing unnecessary tags and attributes, + simplifying text content, removal of the head tag Returns: str: The reduced HTML content based on the specified reduction level. 
""" if reduction == 0: return minify_html(html) - + soup = BeautifulSoup(html, 'html.parser') - - # Remove comments + for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract() - - # Remove script and style tag contents, but keep the tags + for tag in soup(['script', 'style']): tag.string = "" - - # Remove unnecessary attributes, but keep class and id + attrs_to_keep = ['class', 'id', 'href', 'src'] for tag in soup.find_all(True): for attr in list(tag.attrs): if attr not in attrs_to_keep: del tag[attr] - + if reduction == 1: return minify_html(str(soup)) - - # Remove script and style tags completely + for tag in soup(['script', 'style']): tag.decompose() - - # Focus only on the body + body = soup.body if not body: return "No tag found in the HTML" - - # Simplify text content + for tag in body.find_all(string=True): if tag.parent.name not in ['script', 'style']: tag.replace_with(re.sub(r'\s+', ' ', tag.strip())[:20]) - - # Generate reduced HTML + reduced_html = str(body) - - # Apply minification + reduced_html = minify_html(reduced_html) - - return reduced_html \ No newline at end of file + + return reduced_html diff --git a/scrapegraphai/utils/code_error_analysis.py b/scrapegraphai/utils/code_error_analysis.py index fba7e005..ac955502 100644 --- a/scrapegraphai/utils/code_error_analysis.py +++ b/scrapegraphai/utils/code_error_analysis.py @@ -45,4 +45,4 @@ def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], ll "generated_code": state["generated_code"], "differences": json.dumps(comparison_result["differences"], indent=2), "explanation": comparison_result["explanation"] - }) \ No newline at end of file + }) diff --git a/scrapegraphai/utils/code_error_correction.py b/scrapegraphai/utils/code_error_correction.py index 276c7a62..52e92e4c 100644 --- a/scrapegraphai/utils/code_error_correction.py +++ b/scrapegraphai/utils/code_error_correction.py @@ -1,9 +1,9 @@ """ This module contains the code generation functions for code correction for different types errors. """ +import json from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -import json from ..prompts import ( TEMPLATE_SYNTAX_CODE_GENERATION, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_CODE_GENERATION @@ -42,4 +42,4 @@ def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> s "generated_code": state["generated_code"], "generated_result": json.dumps(state["execution_result"], indent=2), "reference_result": json.dumps(state["reference_answer"], indent=2) - }) \ No newline at end of file + }) diff --git a/scrapegraphai/utils/llm_callback_manager.py b/scrapegraphai/utils/llm_callback_manager.py index 03bdaf0b..86a4de83 100644 --- a/scrapegraphai/utils/llm_callback_manager.py +++ b/scrapegraphai/utils/llm_callback_manager.py @@ -1,6 +1,7 @@ """ This module provides a custom callback manager for the LLM models. 
""" + import threading from contextlib import contextmanager from langchain_community.callbacks import get_openai_callback diff --git a/scrapegraphai/utils/output_parser.py b/scrapegraphai/utils/output_parser.py index 3eabfa8b..b7bd1a85 100644 --- a/scrapegraphai/utils/output_parser.py +++ b/scrapegraphai/utils/output_parser.py @@ -22,7 +22,8 @@ def get_structured_output_parser(schema: Union[Dict[str, Any], return _dict_output_parser -def get_pydantic_output_parser(schema: Union[Dict[str, Any], Type[BaseModelV1 | BaseModelV2], Type]) -> JsonOutputParser: +def get_pydantic_output_parser(schema: Union[Dict[str, Any], + Type[BaseModelV1 | BaseModelV2], Type]) -> JsonOutputParser: """ Get the correct output parser for the LLM model. diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py index 07a36e49..7582bac7 100644 --- a/scrapegraphai/utils/prettify_exec_info.py +++ b/scrapegraphai/utils/prettify_exec_info.py @@ -3,7 +3,6 @@ """ import pandas as pd - def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame: """ Transforms the execution information of a graph into a DataFrame for enhanced visualization.