diff --git a/examples/openai/fetch_multiple_links.py b/examples/openai/fetch_multiple_links.py new file mode 100644 index 00000000..c9c07877 --- /dev/null +++ b/examples/openai/fetch_multiple_links.py @@ -0,0 +1,22 @@ + +from scrapegraphai.graphs import DepthSearchGraph + +graph_config = { + "llm": { + "api_key": "YOUR_API_KEY", + "model": "openai/gpt-4o-mini", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": True, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + config=graph_config ) + +result = search_graph.run() +print(result) \ No newline at end of file diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index efd6bd7e..b5ffcc47 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -26,3 +26,4 @@ from .screenshot_scraper_graph import ScreenshotScraperGraph from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph from .code_generator_graph import CodeGeneratorGraph +from .depth_search_graph import DepthSearchGraph diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py new file mode 100644 index 00000000..6ad3b245 --- /dev/null +++ b/scrapegraphai/graphs/depth_search_graph.py @@ -0,0 +1,109 @@ +""" +DepthSearchGraph Module +""" +from typing import Optional +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from ..nodes import ( + FetchNodeLevelK, + ParseNodeDepthK +) + +class DepthSearchGraph(AbstractGraph): + """ + DepthSearchGraph is a scraping pipeline that fetches the HTML content of a source URL and, + recursively, of the pages it links to, up to a configurable depth. The fetched pages are then parsed and returned as documents. + It requires a user prompt, a source URL, and optionally an output schema. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + + Example: + >>> depth_search = DepthSearchGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "openai/gpt-3.5-turbo"}} + ... ) + >>> result = depth_search.run() + """ + + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): + + super().__init__(prompt, config, source, schema) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow.
+ """ + + fetch_node = FetchNodeLevelK( + input="url | local_dir", + output=["docs"], + node_config={ + "loader_kwargs": self.config.get("loader_kwargs", {}), + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "browser_base": self.config.get("browser_base"), + "depth": self.config.get("depth", 1), + "only_inside_links": self.config.get("only_inside_links", False) + } + ) + + parse_node = ParseNodeDepthK( + input="docs", + output=["docs"], + node_config={ + "verbose": self.config.get("verbose", False) + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node + ], + edges=[ + (fetch_node, parse_node), + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> list: + """ + Executes the scraping process and returns the fetched and parsed documents. + + Returns: + list: The documents collected during the crawl. + """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + docs = self.final_state.get("docs", "No docs") + + return docs \ No newline at end of file diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index e5fafb87..edb195a5 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -28,6 +28,7 @@ from .generate_code_node import GenerateCodeNode from .search_node_with_context import SearchLinksWithContext from .reasoning_node import ReasoningNode -from .fetch_node_level_k import FetchNodelevelK +from .fetch_node_level_k import FetchNodeLevelK from .generate_answer_node_k_level import GenerateAnswerNodeKLevel from .description_node import DescriptionNode +from .parse_node_depth_k import ParseNodeDepthK diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 18a0d435..5cdd6571 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -1,15 +1,21 @@ """ -FetchNodelevelK Module +FetchNodeLevelK Module """ from typing import List, Optional from .base_node import BaseNode +from ..docloaders import ChromiumLoader +from ..utils.cleanup_html import cleanup_html +from ..utils.convert_to_md import convert_to_md +from langchain_core.documents import Document +from bs4 import BeautifulSoup +from urllib.parse import quote, urljoin -class FetchNodelevelK(BaseNode): +class FetchNodeLevelK(BaseNode): """ - A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. Relevant chunks are stored in the state. - - It allows scraping of big documents without exceeding the token limit of the language model. + A node responsible for fetching the HTML content of a specified URL and, recursively, of the + pages it links to, up to a given depth. The fetched content is then used to update + the graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously + (with proxy protection). Attributes: llm_model: An instance of a language model client, configured for generating answers.
@@ -27,16 +33,158 @@ def __init__( input: str, output: List[str], node_config: Optional[dict] = None, - node_name: str = "RAG", + node_name: str = "FetchLevelK", ): super().__init__(node_name, "node", input, output, 2, node_config) - - self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) + self.cache_path = node_config.get("cache_path", False) + + self.headless = ( + True if node_config is None else node_config.get("headless", True) + ) + + self.loader_kwargs = ( + {} if node_config is None else node_config.get("loader_kwargs", {}) + ) + + self.browser_base = ( + None if node_config is None else node_config.get("browser_base", None) + ) + + self.depth = ( + 1 if node_config is None else node_config.get("depth", 1) + ) + + self.only_inside_links = ( + False if node_config is None else node_config.get("only_inside_links", False) + ) + + self.min_input_len = 1 def execute(self, state: dict) -> dict: - pass + """ + Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links + and update the graph's state with the content. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with a new output key containing the fetched HTML content. + + Raises: + KeyError: If the input key is not found in the state, indicating that the + necessary information to perform the operation is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + source = input_data[0] + + documents = [{"source": source}] + + loader_kwargs = {} + + if self.node_config is not None: + loader_kwargs = self.node_config.get("loader_kwargs", {}) + + for _ in range(self.depth): + documents = self.obtain_content(documents, loader_kwargs) + + filtered_documents = [doc for doc in documents if 'document' in doc] + + state.update({self.output[0]: filtered_documents}) + + return state + + def fetch_content(self, source: str, loader_kwargs) -> List[Document]: + """Fetches a single source with ChromiumLoader (or Browserbase) and returns it as a list of Documents.""" + self.logger.info(f"--- (Fetching HTML from: {source}) ---") + + if self.browser_base is not None: + try: + from ..docloaders.browser_base import browser_base_fetch + except ImportError: + raise ImportError("""The browserbase module is not installed.
+ Please install it using `pip install browserbase`.""") + + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + + document = [Document(page_content=content, + metadata={"source": source}) for content in data] + + else: + loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) + + document = loader.load() + + return document + + def extract_links(self, html_content: str) -> list: + """Extracts every href value from the anchor tags in the given HTML.""" + soup = BeautifulSoup(html_content, 'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + self.logger.info(f"Extracted {len(links)} links.") + return links + + def get_full_links(self, base_url: str, links: list) -> list: + """Resolves relative links against base_url, optionally keeping only links inside the site.""" + full_links = [] + for link in links: + if self.only_inside_links and link.startswith("http"): + continue + full_link = link if link.startswith("http") else urljoin(base_url, link) + full_links.append(full_link) + return full_links + + def obtain_content(self, documents: List, loader_kwargs) -> List: + """Fetches every source that has no document yet and queues the newly discovered links.""" + new_documents = [] + for doc in list(documents): # iterate over a copy so failed sources can be removed safely + source = doc['source'] + if 'document' not in doc: + document = self.fetch_content(source, loader_kwargs) + + if not document or not document[0].page_content.strip(): + self.logger.warning(f"Failed to fetch content for {source}") + documents.remove(doc) + continue + + doc['document'] = document + + links = self.extract_links(doc['document'][0].page_content) + full_links = self.get_full_links(source, links) + + # Check if the links are already present in other documents + for link in full_links: + # Check if any document is from the same link + if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): + # Add the document + new_documents.append({"source": link}) + + documents.extend(new_documents) + return documents + + def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: + """Recursively fetches the given links up to the requested depth and returns a dict mapping each link to its documents.""" + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + self.logger.info(f"Processing link {idx}: {full_link}") + link_content = self.fetch_content(full_link, loader_kwargs) + + if link_content: + content_dict[full_link] = link_content + if current_depth < depth: + new_links = self.extract_links(link_content[0].page_content) + content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1)) + else: + self.logger.warning(f"Failed to fetch content for {full_link}") + return content_dict \ No newline at end of file diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py new file mode 100644 index 00000000..7b7ab194 --- /dev/null +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -0,0 +1,72 @@ +""" +ParseNodeDepthK Module +""" +import re +from typing import List, Optional, Tuple +from .base_node import BaseNode +from ..utils.convert_to_md import convert_to_md +from langchain_community.document_transformers import Html2TextTransformer + +class ParseNodeDepthK(BaseNode): + """ + A node responsible for parsing HTML content from a series of documents. + + This node converts the fetched HTML documents to plain text with Html2TextTransformer, + making large pages easier to process in later steps. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "ParseNodeDepthK". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "ParseNodeDepthK", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to parse the HTML content of the documents. + + Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. + + Returns: + dict: The updated state with the output key containing the parsed documents. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary information for parsing the content is missing. + """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + documents = input_data[0] + + for doc in documents: + document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) + #document_md = convert_to_md(doc["document"]) + doc["document"] = document_md[0].page_content + + state.update({self.output[0]: documents}) + + return state diff --git a/scrapegraphai/utils/1_manual.py b/scrapegraphai/utils/1_manual.py new file mode 100644 index 00000000..21703b7b --- /dev/null +++ b/scrapegraphai/utils/1_manual.py @@ -0,0 +1,92 @@ +import requests +import logging +import time +from urllib.parse import quote, urljoin +from typing import Optional +from bs4 import BeautifulSoup +from dotenv import load_dotenv +import os +import json +import markdownify + +load_dotenv() + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def fetch_content(token: str, target_url: str, max_retries: int = 5, retry_delay: int = 3) -> Optional[str]: + encoded_url = quote(target_url) + url = f"http://api.scrape.do?url={encoded_url}&token={token}&render=true&waitUntil=networkidle0" + + for attempt in range(max_retries): + try: + response = requests.get(url) + if response.status_code == 200: + logging.info(f"Successfully fetched content from {target_url}") + return response.text + logging.warning(f"Failed with status {response.status_code}. Retrying in {retry_delay}s...") + except requests.RequestException as e: + logging.error(f"Error fetching {target_url}: {e}.
Retrying in {retry_delay}s...") + time.sleep(retry_delay) + + logging.error(f"Failed to fetch {target_url} after {max_retries} attempts.") + return None + +def extract_links(html_content: str) -> list: + soup = BeautifulSoup(html_content, 'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + logging.info(f"Extracted {len(links)} links.") + return links + +def process_links(token: str, base_url: str, links: list, depth: int, current_depth: int = 1) -> dict: + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + logging.info(f"Processing link {idx}: {full_link}") + link_content = fetch_content(token, full_link) + if link_content: + markdown_content = markdownify.markdownify(link_content, heading_style="ATX") + content_dict[full_link] = markdown_content + save_content_to_json(content_dict, idx) + + if current_depth < depth: + new_links = extract_links(link_content) + content_dict.update(process_links(token, full_link, new_links, depth, current_depth + 1)) + else: + logging.warning(f"Failed to fetch content for {full_link}") + return content_dict + +def save_content_to_json(content_dict: dict, idx: int): + if not os.path.exists("downloaded_pages"): + os.makedirs("downloaded_pages") + + file_name = f"scraped_content_{idx}.json" + file_path = os.path.join("downloaded_pages", file_name) + + with open(file_path, "w", encoding="utf-8") as json_file: + json.dump(content_dict, json_file, ensure_ascii=False, indent=4) + + logging.info(f"Content saved to {file_path}") + +if __name__ == "__main__": + token = os.getenv("TOKEN") + target_url = "https://www.wired.com" + depth = 2 + + if not token: + logging.error("Please set the TOKEN environment variable.") + exit(1) + + html_content = fetch_content(token, target_url) + + if html_content: + links = extract_links(html_content) + logging.info("Links found:") + for link in links: + logging.info(link) + + content_dict = process_links(token, target_url, links, depth) + for link, content in content_dict.items(): + logging.info(f"Link: {link}") + logging.info(f"Content: {content[:500]}...") + else: + logging.error("Failed to fetch the content.")
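
The crawl strategy implemented by FetchNodeLevelK.execute above can be hard to follow when read as a diff: starting from [{"source": url}], each of the `depth` passes fetches every source that has no document yet, extracts its links, and queues the ones not seen before. The sketch below is a minimal, self-contained illustration of that same expansion loop; it substitutes requests and BeautifulSoup for the node's ChromiumLoader purely for brevity, so treat it as an approximation of the patch rather than part of it.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def crawl_level_k(start_url: str, depth: int = 2, only_inside_links: bool = True) -> list:
    # Same document shape the node keeps in the graph state: one dict per source.
    documents = [{"source": start_url}]
    for _ in range(depth):                      # one expansion pass per depth level
        new_documents = []
        for doc in list(documents):
            if "document" in doc:               # already fetched on an earlier pass
                continue
            html = requests.get(doc["source"], timeout=10).text
            doc["document"] = html
            soup = BeautifulSoup(html, "html.parser")
            for href in (a["href"] for a in soup.find_all("a", href=True)):
                if only_inside_links and href.startswith("http"):
                    continue                    # mirrors the node's only_inside_links filter
                link = href if href.startswith("http") else urljoin(doc["source"], href)
                if not any(d["source"] == link for d in documents + new_documents):
                    new_documents.append({"source": link})
        documents.extend(new_documents)
    # As in the node, keep only the entries that were actually fetched.
    return [doc for doc in documents if "document" in doc]

if __name__ == "__main__":
    pages = crawl_level_k("https://perinim.github.io/projects/", depth=2)
    print(f"Fetched {len(pages)} pages")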