From 612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254 Mon Sep 17 00:00:00 2001
From: roryhaung
Date: Wed, 16 Oct 2024 18:37:50 +0800
Subject: [PATCH 01/14] feat: implement ScrapeGraph class for web scraping automation only

---
 scrapegraphai/graphs/scrape_graph.py | 98 ++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 scrapegraphai/graphs/scrape_graph.py

diff --git a/scrapegraphai/graphs/scrape_graph.py b/scrapegraphai/graphs/scrape_graph.py
new file mode 100644
index 00000000..a08149aa
--- /dev/null
+++ b/scrapegraphai/graphs/scrape_graph.py
@@ -0,0 +1,98 @@
+"""
+ScrapeGraph Module
+"""
+from typing import Optional
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+)
+
+class ScrapeGraph(AbstractGraph):
+    """
+    ScrapeGraph is a scraping pipeline that automates the process of
+    fetching and parsing web pages.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> scraper = ScrapeGraph(
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
+        ... )
+        >>> result = scraper.run()
+
+    """
+
+    def __init__(self, source: str, config: dict, prompt: str = "", schema: Optional[BaseModel] = None):
+        super().__init__(prompt, config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+        fetch_node = FetchNode(
+            input="url| local_dir",
+            output=["doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "browser_base": self.config.get("browser_base"),
+                "scrape_do": self.config.get("scrape_do")
+            }
+        )
+
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "chunk_size": self.model_token
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the scraped content.
+
+        Returns:
+            str: The scraped content.
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("parsed_doc", "No document found.")

From 3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4 Mon Sep 17 00:00:00 2001
From: roryhaung
Date: Wed, 16 Oct 2024 19:38:53 +0800
Subject: [PATCH 02/14] feat: Implement SmartScraperMultiParseMergeFirstGraph
 class that scrapes a list of URLs, merges the content first, and finally
 generates answers to a given prompt.
(Different from the SmartScraperMultiGraph is that in this case the content is merged before to be processed by the llm.) --- scrapegraphai/graphs/__init__.py | 2 + ...t_scraper_multi_parse_merge_first_graph.py | 103 ++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 5b217bc9..0acec56b 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -25,3 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph +from .smart_scraper_multi_parse_merge_first_graph import SmartScraperMultiParseMergeFirstGraph +from .scrape_graph import ScrapeGraph diff --git a/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py b/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py new file mode 100644 index 00000000..860e2ca2 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py @@ -0,0 +1,103 @@ +""" +SmartScraperMultiGraph Module +""" +from copy import deepcopy +from typing import List, Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .scrape_graph import ScrapeGraph +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode, +) +from ..utils.copy import safe_deepcopy + +class SmartScraperMultiParseMergeFirstGraph(AbstractGraph): + """ + SmartScraperMultiParseMergeFirstGraph is a scraping pipeline that scrapes a + list of URLs and merge the content first and finally generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content is merged + before to be passed to the llm. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. + + Example: + >>> search_graph = SmartScraperMultiParseMergeFirstGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): + + self.copy_config = safe_deepcopy(config) + self.copy_schema = deepcopy(schema) + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping + and parsing and then merge the content and generates answers to a given prompt. 
+ """ + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["parsed_doc"], + node_config={ + "graph_instance": ScrapeGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & parsed_doc", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.copy_schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and parsing process first and + then concatenate the content and generates answers to a given prompt. + + Returns: + str: The answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + return self.final_state.get("answer", "No answer found.") From cdb3c1100ee1117afedbc70437317acaf7c7c1d3 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Wed, 16 Oct 2024 20:05:03 +0800 Subject: [PATCH 03/14] test: Add scrape_graph test --- tests/graphs/scrape_graph_test.py | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/graphs/scrape_graph_test.py diff --git a/tests/graphs/scrape_graph_test.py b/tests/graphs/scrape_graph_test.py new file mode 100644 index 00000000..00d3f4fb --- /dev/null +++ b/tests/graphs/scrape_graph_test.py @@ -0,0 +1,50 @@ +""" +Module for testing the scrape graph class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import ScrapeGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + openai_key = os.getenv("OPENAI_APIKEY") + return { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + scrape_graph = ScrapeGraph( + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + result = scrape_graph.run() + + assert result is not None + assert isinstance(result, list) + +def test_get_execution_info(graph_config): + """Get the execution info""" + scrape_graph = ScrapeGraph( + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + scrape_graph.run() + + graph_exec_info = scrape_graph.get_execution_info() + + assert graph_exec_info is not None From 464b8b04ea0d51280849173d5eda92d4d4db8612 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Wed, 16 Oct 2024 20:05:36 +0800 Subject: [PATCH 04/14] test: Add smart_scraper_multi_parse_merge_first_graph test --- ...aper_multi_parse_merge_first_graph_test.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py diff --git a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py b/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py new file mode 100644 index 00000000..506ce5da --- /dev/null +++ b/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py @@ -0,0 +1,59 @@ +""" +Module for testing the smart scraper class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiParseConcatFirstGraph 
+from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + openai_key = os.getenv("OPENAI_APIKEY") + + return { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config, + ) + + result = smart_scraper_multi_parse_concat_first_graph.run() + + assert result is not None + assert isinstance(result, dict) + +def test_get_execution_info(graph_config): + """Get the execution info""" + smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config, + ) + + smart_scraper_multi_parse_concat_first_graph.run() + + graph_exec_info = smart_scraper_multi_parse_concat_first_graph.get_execution_info() + + assert graph_exec_info is not None From 2512262be81b686f559711584e69c725dd53a187 Mon Sep 17 00:00:00 2001 From: shenghong Date: Thu, 17 Oct 2024 06:46:34 +0800 Subject: [PATCH 05/14] Rename smart_scraper_multi_parse_merge_first_graph_test.py to smart_scraper_multi_parse_merge_first_graph_openai_test.py --- ...=> smart_scraper_multi_parse_merge_first_graph_openai_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/graphs/{smart_scraper_multi_parse_merge_first_graph_test.py => smart_scraper_multi_parse_merge_first_graph_openai_test.py} (100%) diff --git a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py b/tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py similarity index 100% rename from tests/graphs/smart_scraper_multi_parse_merge_first_graph_test.py rename to tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py From 69ff6495564a5c670b89c0f802ebb1602f0e7cfa Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:36:29 +0800 Subject: [PATCH 06/14] fix: fix the example variable name --- scrapegraphai/graphs/smart_scraper_multi_concat_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py index 312d6457..a13d8aa1 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py @@ -35,11 +35,11 @@ class SmartScraperMultiConcatGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> search_graph = MultipleSearchGraph( + >>> smart_scraper_multi_concat_graph = SmartScraperMultiConcatGraph( ... "What is Chioggia famous for?", ... {"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) - >>> result = search_graph.run() + >>> result = smart_scraper_multi_concat_graph.run() """ def __init__(self, prompt: str, source: List[str], From 94d8042c2a510b29138127e1abd4ddd9e0b49ed0 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:39:42 +0800 Subject: [PATCH 07/14] rename smart_scraper_multi_graph to smart_scraper_multi_abstract_graph --- .../smart_scraper_multi_abstract_graph.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py diff --git a/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py b/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py new file mode 100644 index 00000000..f5ffdf96 --- /dev/null +++ b/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py @@ -0,0 +1,104 @@ +""" +SmartScraperMultiGraph Module +""" +from copy import deepcopy +from typing import List, Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_graph import SmartScraperGraph +from ..nodes import ( + GraphIteratorNode, + MergeAnswersNode +) +from ..utils.copy import safe_deepcopy + +class SmartScraperMultiAbstractGraph(AbstractGraph): + """ + SmartScraperMultiAbstractGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content will be abstracted + by llm and then merged finally passed to the llm. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model (dict): The configuration for the language model. + embedder_model (dict): The configuration for the embedder model. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (Optional[BaseModel]): The schema for the graph output. + + Example: + >>> smart_scraper_multi_abstract_graph = SmartScraperMultiAbstractGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = smart_scraper_multi_abstract_graph.run() + """ + + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): + + self.max_results = config.get("max_results", 3) + self.copy_config = safe_deepcopy(config) + self.copy_schema = deepcopy(schema) + + super().__init__(prompt, config, source, schema) + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. 
+ """ + + graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["results"], + node_config={ + "graph_instance": SmartScraperGraph, + "scraper_config": self.copy_config, + }, + schema=self.copy_schema + ) + + merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": self.llm_model, + "schema": self.copy_schema + } + ) + + return BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + ], + edges=[ + (graph_iterator_node, merge_answers_node), + ], + entry_point=graph_iterator_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the web scraping and searching process. + + Returns: + str: The answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, "urls": self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") From dfc67c670d871fac5116223461a56c9560959eb9 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:49:54 +0800 Subject: [PATCH 08/14] rename the smart_scraper_multi_parse_merge_first_graph to smart_scraper_multi_graph,so delete this file --- ...t_scraper_multi_parse_merge_first_graph.py | 103 ------------------ 1 file changed, 103 deletions(-) delete mode 100644 scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py diff --git a/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py b/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py deleted file mode 100644 index 860e2ca2..00000000 --- a/scrapegraphai/graphs/smart_scraper_multi_parse_merge_first_graph.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -SmartScraperMultiGraph Module -""" -from copy import deepcopy -from typing import List, Optional -from pydantic import BaseModel -from .base_graph import BaseGraph -from .abstract_graph import AbstractGraph -from .scrape_graph import ScrapeGraph -from ..nodes import ( - GraphIteratorNode, - MergeAnswersNode, -) -from ..utils.copy import safe_deepcopy - -class SmartScraperMultiParseMergeFirstGraph(AbstractGraph): - """ - SmartScraperMultiParseMergeFirstGraph is a scraping pipeline that scrapes a - list of URLs and merge the content first and finally generates answers to a given prompt. - It only requires a user prompt and a list of URLs. - The difference with the SmartScraperMultiGraph is that in this case the content is merged - before to be passed to the llm. - - Attributes: - prompt (str): The user prompt to search the internet. - llm_model (dict): The configuration for the language model. - embedder_model (dict): The configuration for the embedder model. - headless (bool): A flag to run the browser in headless mode. - verbose (bool): A flag to display the execution information. - model_token (int): The token limit for the language model. - - Args: - prompt (str): The user prompt to search the internet. - source (List[str]): The source of the graph. - config (dict): Configuration parameters for the graph. - schema (Optional[BaseModel]): The schema for the graph output. - - Example: - >>> search_graph = SmartScraperMultiParseMergeFirstGraph( - ... prompt="Who is Marco Perini?", - ... source= [ - ... "https://perinim.github.io/", - ... "https://perinim.github.io/cv/" - ... ], - ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} - ... 
) - >>> result = search_graph.run() - """ - - def __init__(self, prompt: str, source: List[str], - config: dict, schema: Optional[BaseModel] = None): - - self.copy_config = safe_deepcopy(config) - self.copy_schema = deepcopy(schema) - super().__init__(prompt, config, source, schema) - - def _create_graph(self) -> BaseGraph: - """ - Creates the graph of nodes representing the workflow for web scraping - and parsing and then merge the content and generates answers to a given prompt. - """ - graph_iterator_node = GraphIteratorNode( - input="user_prompt & urls", - output=["parsed_doc"], - node_config={ - "graph_instance": ScrapeGraph, - "scraper_config": self.copy_config, - }, - schema=self.copy_schema - ) - - merge_answers_node = MergeAnswersNode( - input="user_prompt & parsed_doc", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "schema": self.copy_schema - } - ) - - return BaseGraph( - nodes=[ - graph_iterator_node, - merge_answers_node, - ], - edges=[ - (graph_iterator_node, merge_answers_node), - ], - entry_point=graph_iterator_node, - graph_name=self.__class__.__name__ - ) - - def run(self) -> str: - """ - Executes the web scraping and parsing process first and - then concatenate the content and generates answers to a given prompt. - - Returns: - str: The answer to the prompt. - """ - inputs = {"user_prompt": self.prompt, "urls": self.source} - self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") From 78bd40c3b54cd656e0fe2e789e978b59dcb96d5b Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:51:26 +0800 Subject: [PATCH 09/14] modify the graph name --- scrapegraphai/graphs/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 0acec56b..bfb8e300 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -13,7 +13,7 @@ from .csv_scraper_graph import CSVScraperGraph from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph -from .smart_scraper_multi_graph import SmartScraperMultiGraph +from .smart_scraper_multi_abstract_graph import SmartScraperMultiAbstractGraph from .json_scraper_multi_graph import JSONScraperMultiGraph from .csv_scraper_multi_graph import CSVScraperMultiGraph from .xml_scraper_multi_graph import XMLScraperMultiGraph @@ -25,5 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph -from .smart_scraper_multi_parse_merge_first_graph import SmartScraperMultiParseMergeFirstGraph +from .smart_scraper_multi_graph import SmartScraperMultiGraph from .scrape_graph import ScrapeGraph From 6dbac936683042ef2e517a71b6fb1655508a1568 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 01:52:39 +0800 Subject: [PATCH 10/14] rename the SmartScraperMultiParseMergeFirstGraph to SmartScraperMultiGraph --- .../graphs/smart_scraper_multi_graph.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 5dff3277..2f628e81 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -6,18 +6,20 @@ from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph 
import AbstractGraph -from .smart_scraper_graph import SmartScraperGraph +from .scrape_graph import ScrapeGraph from ..nodes import ( GraphIteratorNode, - MergeAnswersNode + MergeAnswersNode, ) from ..utils.copy import safe_deepcopy class SmartScraperMultiGraph(AbstractGraph): """ SmartScraperMultiGraph is a scraping pipeline that scrapes a - list of URLs and generates answers to a given prompt. + list of URLs and merge the content first and finally generates answers to a given prompt. It only requires a user prompt and a list of URLs. + The difference with the SmartScraperMultiGraph is that in this case the content is merged + before to be passed to the llm. Attributes: prompt (str): The user prompt to search the internet. @@ -34,42 +36,41 @@ class SmartScraperMultiGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> search_graph = MultipleSearchGraph( - ... "What is Chioggia famous for?", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) - >>> result = search_graph.run() + >>> result = smart_scraper_multi_graph.run() """ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): - self.max_results = config.get("max_results", 3) self.copy_config = safe_deepcopy(config) self.copy_schema = deepcopy(schema) - super().__init__(prompt, config, source, schema) def _create_graph(self) -> BaseGraph: """ - Creates the graph of nodes representing the workflow for web scraping and searching. - - Returns: - BaseGraph: A graph instance representing the web scraping and searching workflow. + Creates the graph of nodes representing the workflow for web scraping + and parsing and then merge the content and generates answers to a given prompt. """ - graph_iterator_node = GraphIteratorNode( input="user_prompt & urls", - output=["results"], + output=["parsed_doc"], node_config={ - "graph_instance": SmartScraperGraph, + "graph_instance": ScrapeGraph, "scraper_config": self.copy_config, }, schema=self.copy_schema ) merge_answers_node = MergeAnswersNode( - input="user_prompt & results", + input="user_prompt & parsed_doc", output=["answer"], node_config={ "llm_model": self.llm_model, @@ -91,12 +92,12 @@ def _create_graph(self) -> BaseGraph: def run(self) -> str: """ - Executes the web scraping and searching process. + Executes the web scraping and parsing process first and + then concatenate the content and generates answers to a given prompt. Returns: str: The answer to the prompt. 
""" inputs = {"user_prompt": self.prompt, "urls": self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") From 974f88a77e853884d8a83c0d44a79c013727cc55 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:01:59 +0800 Subject: [PATCH 11/14] rename SmartScraperMultiGraph to SmartScraperMultiLiteGraph --- ...r_multi_graph.py => smart_scraper_multi_lite_graph.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename scrapegraphai/graphs/{smart_scraper_multi_graph.py => smart_scraper_multi_lite_graph.py} (93%) diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py similarity index 93% rename from scrapegraphai/graphs/smart_scraper_multi_graph.py rename to scrapegraphai/graphs/smart_scraper_multi_lite_graph.py index 2f628e81..14e576d9 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_lite_graph.py @@ -13,9 +13,9 @@ ) from ..utils.copy import safe_deepcopy -class SmartScraperMultiGraph(AbstractGraph): +class SmartScraperMultiLiteGraph(AbstractGraph): """ - SmartScraperMultiGraph is a scraping pipeline that scrapes a + SmartScraperMultiLiteGraph is a scraping pipeline that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. It only requires a user prompt and a list of URLs. The difference with the SmartScraperMultiGraph is that in this case the content is merged @@ -36,7 +36,7 @@ class SmartScraperMultiGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + >>> smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( ... prompt="Who is Marco Perini?", ... source= [ ... "https://perinim.github.io/", @@ -44,7 +44,7 @@ class SmartScraperMultiGraph(AbstractGraph): ... ], ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... ) - >>> result = smart_scraper_multi_graph.run() + >>> result = smart_scraper_multi_lite_graph.run() """ def __init__(self, prompt: str, source: List[str], From 3e8f047ab606db4549c5d3b28b681f47b8c08725 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:10:57 +0800 Subject: [PATCH 12/14] Renamed smart_scraper_multi_abstract_graph back to smart_scraper_multi_graph. 
--- scrapegraphai/graphs/__init__.py | 4 ++-- ...t_graph.py => smart_scraper_multi_graph.py} | 18 +++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) rename scrapegraphai/graphs/{smart_scraper_multi_abstract_graph.py => smart_scraper_multi_graph.py} (84%) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index bfb8e300..9c8bc820 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -13,7 +13,7 @@ from .csv_scraper_graph import CSVScraperGraph from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph -from .smart_scraper_multi_abstract_graph import SmartScraperMultiAbstractGraph +from .smart_scraper_multi_graph import SmartScraperMultiGraph from .json_scraper_multi_graph import JSONScraperMultiGraph from .csv_scraper_multi_graph import CSVScraperMultiGraph from .xml_scraper_multi_graph import XMLScraperMultiGraph @@ -25,5 +25,5 @@ from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph from .depth_search_graph import DepthSearchGraph -from .smart_scraper_multi_graph import SmartScraperMultiGraph +from .smart_scraper_multi_lite_graph import SmartScraperMultiLiteGraph from .scrape_graph import ScrapeGraph diff --git a/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py similarity index 84% rename from scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py rename to scrapegraphai/graphs/smart_scraper_multi_graph.py index f5ffdf96..420dc784 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_abstract_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -13,12 +13,12 @@ ) from ..utils.copy import safe_deepcopy -class SmartScraperMultiAbstractGraph(AbstractGraph): +class SmartScraperMultiGraph(AbstractGraph): """ - SmartScraperMultiAbstractGraph is a scraping pipeline that scrapes a + SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. - The difference with the SmartScraperMultiGraph is that in this case the content will be abstracted + The difference with the SmartScraperMultiLiteGraph is that in this case the content will be abstracted by llm and then merged finally passed to the llm. Attributes: @@ -36,11 +36,15 @@ class SmartScraperMultiAbstractGraph(AbstractGraph): schema (Optional[BaseModel]): The schema for the graph output. Example: - >>> smart_scraper_multi_abstract_graph = SmartScraperMultiAbstractGraph( - ... "What is Chioggia famous for?", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + >>> smart_scraper_multi_graph = SmartScraperMultiGraph( + ... prompt="Who is Marco Perini?", + ... source= [ + ... "https://perinim.github.io/", + ... "https://perinim.github.io/cv/" + ... ], + ... config={"llm": {"model": "openai/gpt-3.5-turbo"}} ... 
) - >>> result = smart_scraper_multi_abstract_graph.run() + >>> result = smart_scraper_multi_graph.run() """ def __init__(self, prompt: str, source: List[str], From 28dda2b476e1b2da9e39cc212133fcaca7cc5b11 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:14:08 +0800 Subject: [PATCH 13/14] rename graph name --- ...=> smart_scraper_multi_lite_graph_openai_test.py} | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) rename tests/graphs/{smart_scraper_multi_parse_merge_first_graph_openai_test.py => smart_scraper_multi_lite_graph_openai_test.py} (70%) diff --git a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py similarity index 70% rename from tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py rename to tests/graphs/smart_scraper_multi_lite_graph_openai_test.py index 506ce5da..0a0e0a69 100644 --- a/tests/graphs/smart_scraper_multi_parse_merge_first_graph_openai_test.py +++ b/tests/graphs/smart_scraper_multi_lite_graph_openai_test.py @@ -6,7 +6,7 @@ import pytest import pandas as pd from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperMultiParseConcatFirstGraph +from scrapegraphai.graphs import SmartScraperMultiLiteGraph from scrapegraphai.utils import prettify_exec_info load_dotenv() @@ -27,7 +27,7 @@ def graph_config(): def test_scraping_pipeline(graph_config): """Start of the scraping pipeline""" - smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( prompt="Who is Marco Perini?", source= [ "https://perinim.github.io/", @@ -36,14 +36,14 @@ def test_scraping_pipeline(graph_config): config=graph_config, ) - result = smart_scraper_multi_parse_concat_first_graph.run() + result = smart_scraper_multi_lite_graph.run() assert result is not None assert isinstance(result, dict) def test_get_execution_info(graph_config): """Get the execution info""" - smart_scraper_multi_parse_concat_first_graph = SmartScraperMultiParseConcatFirstGraph( + smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( prompt="Who is Marco Perini?", source= [ "https://perinim.github.io/", @@ -52,8 +52,8 @@ def test_get_execution_info(graph_config): config=graph_config, ) - smart_scraper_multi_parse_concat_first_graph.run() + smart_scraper_multi_lite_graph.run() - graph_exec_info = smart_scraper_multi_parse_concat_first_graph.get_execution_info() + graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() assert graph_exec_info is not None From da2a3c8ec7d9c3c7e805fd6193035bd1bc284375 Mon Sep 17 00:00:00 2001 From: roryhaung Date: Fri, 18 Oct 2024 03:19:00 +0800 Subject: [PATCH 14/14] add smart_scraper_multi_lite_graph example --- .../openai/smart_scraper_multi_lite_openai.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 examples/openai/smart_scraper_multi_lite_openai.py diff --git a/examples/openai/smart_scraper_multi_lite_openai.py b/examples/openai/smart_scraper_multi_lite_openai.py new file mode 100644 index 00000000..69eeafc7 --- /dev/null +++ b/examples/openai/smart_scraper_multi_lite_openai.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiLiteGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define 
the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_multi_lite_graph = SmartScraperMultiLiteGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + config=graph_config +) + +result = smart_scraper_multi_lite_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_multi_lite_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info))
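
The series ships a runnable example only for SmartScraperMultiLiteGraph (PATCH 14 above). A comparable usage sketch for the new ScrapeGraph (not part of the patches; it assumes only the constructor, config keys, and helpers that the patches themselves introduce) could look like this:

"""
Minimal usage sketch for the new ScrapeGraph, which only fetches and parses
a page and returns the parsed content, with no LLM answer generation.
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScrapeGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# An LLM config is still required: FetchNode and ParseNode receive "llm_model"
# and the model token limit for chunking, even though no answer is generated.
graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# ScrapeGraph takes the source and config; the prompt is optional because the
# pipeline stops after parsing (see PATCH 01 and the test in PATCH 03).
scrape_graph = ScrapeGraph(
    source="https://perinim.github.io/projects/",
    config=graph_config,
)

# run() returns the "parsed_doc" state, expected to be a list of parsed chunks.
result = scrape_graph.run()
print(result)

# Execution metadata, as in the other examples of this series.
print(prettify_exec_info(scrape_graph.get_execution_info()))

Unlike SmartScraperMultiLiteGraph, no MergeAnswersNode runs here, so the result is the parsed document itself rather than an answer to a prompt.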