22 changes: 22 additions & 0 deletions examples/openai/fetch_multiple_links.py
@@ -0,0 +1,22 @@

from scrapegraphai.graphs import DepthSearchGraph

graph_config = {
"llm": {
"api_key":"YOUR_API_KEY",
"model": "openai/gpt-4o-mini",
},
"verbose": True,
"headless": False,
"depth": 2,
"only_inside_links": True,
}

search_graph = DepthSearchGraph(
prompt="List me all the projects with their description",
source="https://perinim.github.io/projects/",
config=graph_config
)

result = search_graph.run()
print(result)
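Note: DepthSearchGraph.run() returns the documents collected up to the configured depth rather than a single answer string, so a quick way to sanity-check the crawl is to iterate over the result. Below is a minimal sketch, assuming each entry keeps the "source" key set by FetchNodeLevelK; the exact shape of the parsed "document" field depends on ParseNodeDepthK.

```python
# Minimal sketch: inspect the crawl result produced by DepthSearchGraph.run().
# Assumes `result` is the list of document dicts built by FetchNodeLevelK,
# each with at least a "source" key; run() falls back to the string "No docs" otherwise.
if isinstance(result, list):
    print(f"Fetched {len(result)} page(s)")
    for doc in result:
        print("-", doc.get("source", "<unknown source>"))
else:
    print(result)
```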
1 change: 1 addition & 0 deletions scrapegraphai/graphs/__init__.py
@@ -26,3 +26,4 @@
from .screenshot_scraper_graph import ScreenshotScraperGraph
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
from .code_generator_graph import CodeGeneratorGraph
from .depth_search_graph import DepthSearchGraph
109 changes: 109 additions & 0 deletions scrapegraphai/graphs/depth_search_graph.py
@@ -0,0 +1,109 @@
"""
DepthSearchGraph Module
"""
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNodeLevelK,
ParseNodeDepthK
)

class DepthSearchGraph(AbstractGraph):
"""
    DepthSearchGraph is a scraping pipeline that fetches the HTML content of a source URL and,
    recursively, of the pages it links to, up to a configurable depth. The fetched pages are
    parsed and returned as the graph's result.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> search_graph = DepthSearchGraph(
        ...     "List me all the attractions in Chioggia.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):

super().__init__(prompt, config, source, schema)

self.input_key = "url" if source.startswith("http") else "local_dir"

def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping.

Returns:
BaseGraph: A graph instance representing the web scraping workflow.
"""

        fetch_node = FetchNodeLevelK(
            input="url | local_dir",
            output=["docs"],
            node_config={
                "llm_model": self.llm_model,
                "loader_kwargs": self.config.get("loader_kwargs", {}),
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "browser_base": self.config.get("browser_base"),
                "depth": self.config.get("depth", 1),
                "only_inside_links": self.config.get("only_inside_links", False)
            }
        )

parse_node = ParseNodeDepthK(
input="docs",
output=["docs"],
node_config={
"verbose": self.config.get("verbose", False)
}
)

return BaseGraph(
nodes=[
fetch_node,
parse_node
],
edges=[
(fetch_node, parse_node),
],
entry_point=fetch_node,
graph_name=self.__class__.__name__
)

    def run(self) -> list:
        """
        Executes the scraping process and returns the fetched documents.

        Returns:
            list: The documents fetched and parsed up to the configured depth.
        """

inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

docs = self.final_state.get("docs", "No docs")

return docs
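Since run() stores execution_info alongside final_state, the usual scrapegraphai pattern of printing per-node execution statistics should also apply here. A minimal sketch, assuming the get_execution_info() helper inherited from AbstractGraph and the prettify_exec_info util used in the other openai examples:

```python
from scrapegraphai.graphs import DepthSearchGraph
from scrapegraphai.utils import prettify_exec_info  # assumed available, as in other examples

graph_config = {
    "llm": {"api_key": "YOUR_API_KEY", "model": "openai/gpt-4o-mini"},
    "depth": 2,
    "only_inside_links": True,
    "verbose": True,
}

search_graph = DepthSearchGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)

docs = search_graph.run()

# execution_info is populated by BaseGraph.execute() inside run()
print(prettify_exec_info(search_graph.get_execution_info()))
```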
3 changes: 2 additions & 1 deletion scrapegraphai/nodes/__init__.py
@@ -28,6 +28,7 @@
from .generate_code_node import GenerateCodeNode
from .search_node_with_context import SearchLinksWithContext
from .reasoning_node import ReasoningNode
from .fetch_node_level_k import FetchNodelevelK
from .fetch_node_level_k import FetchNodeLevelK
from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
from .description_node import DescriptionNode
from .parse_node_depth_k import ParseNodeDepthK
168 changes: 158 additions & 10 deletions scrapegraphai/nodes/fetch_node_level_k.py
@@ -1,15 +1,21 @@
"""
FetchNodelevelK Module
FetchNodeLevelK Module
"""
from typing import List, Optional
from .base_node import BaseNode
from ..docloaders import ChromiumLoader
from ..utils.cleanup_html import cleanup_html
from ..utils.convert_to_md import convert_to_md
from langchain_core.documents import Document
from bs4 import BeautifulSoup
from urllib.parse import quote, urljoin

class FetchNodelevelK(BaseNode):
class FetchNodeLevelK(BaseNode):
"""
A node responsible for compressing the input tokens and storing the document
in a vector database for retrieval. Relevant chunks are stored in the state.

It allows scraping of big documents without exceeding the token limit of the language model.
    A node responsible for fetching the HTML content of a specified URL and all its sub-links
    recursively, up to a configurable depth. The fetched content is then used to update the
    graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously
    (with proxy protection).

Attributes:
llm_model: An instance of a language model client, configured for generating answers.
@@ -27,16 +27,33 @@ def __init__(
input: str,
output: List[str],
node_config: Optional[dict] = None,
node_name: str = "RAG",
node_name: str = "FetchLevelK",
):
super().__init__(node_name, "node", input, output, 2, node_config)

self.llm_model = node_config["llm_model"]

self.embedder_model = node_config.get("embedder_model", None)

self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)

self.cache_path = node_config.get("cache_path", False)

self.headless = (
True if node_config is None else node_config.get("headless", True)
)

self.loader_kwargs = (
{} if node_config is None else node_config.get("loader_kwargs", {})
)

self.browser_base = (
None if node_config is None else node_config.get("browser_base", None)
)

self.depth = (
1 if node_config is None else node_config.get("depth", 1)
)

self.only_inside_links = (
False if node_config is None else node_config.get("only_inside_links", False)
)

self.min_input_len = 1

def execute(self, state: dict) -> dict:
pass
"""
Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links
and update the graph's state with the content.

Args:
state (dict): The current state of the graph. The input keys will be used
to fetch the correct data types from the state.

Returns:
dict: The updated state with a new output key containing the fetched HTML content.

Raises:
KeyError: If the input key is not found in the state, indicating that the
necessary information to perform the operation is missing.
"""

self.logger.info(f"--- Executing {self.node_name} Node ---")

# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state)
# Fetching data from the state based on the input keys
input_data = [state[key] for key in input_keys]

source = input_data[0]

documents = [{"source": source}]

loader_kwargs = {}

if self.node_config is not None:
loader_kwargs = self.node_config.get("loader_kwargs", {})

for _ in range(self.depth):
documents = self.obtain_content(documents, loader_kwargs)

filtered_documents = [doc for doc in documents if 'document' in doc]

state.update({self.output[0]: filtered_documents})

return state

    def fetch_content(self, source: str, loader_kwargs) -> List[Document]:
        """Fetches the given source and returns its content as a list of Documents."""
        self.logger.info(f"--- (Fetching HTML from: {source}) ---")

if self.browser_base is not None:
try:
from ..docloaders.browser_base import browser_base_fetch
except ImportError:
raise ImportError("""The browserbase module is not installed.
Please install it using `pip install browserbase`.""")

data = browser_base_fetch(self.browser_base.get("api_key"),
self.browser_base.get("project_id"), [source])

document = [Document(page_content=content,
metadata={"source": source}) for content in data]

else:
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)

document = loader.load()

return document

    def extract_links(self, html_content: str) -> list:
        """Extracts all hyperlink targets (href attributes) from the given HTML."""
        soup = BeautifulSoup(html_content, 'html.parser')
links = [link['href'] for link in soup.find_all('a', href=True)]
self.logger.info(f"Extracted {len(links)} links.")
return links

    def get_full_links(self, base_url: str, links: list) -> list:
        """Resolves relative links against base_url; absolute links are skipped
        when only_inside_links is enabled."""
        full_links = []
for link in links:
if self.only_inside_links and link.startswith("http"):
continue
full_link = link if link.startswith("http") else urljoin(base_url, link)
full_links.append(full_link)
return full_links

    def obtain_content(self, documents: List, loader_kwargs) -> List:
        """Fetches content for every document that does not have it yet and queues
        the newly discovered links as additional documents."""
        new_documents = []
        # Iterate over a copy so items can be safely removed from documents while looping
        for doc in documents[:]:
            source = doc['source']
            if 'document' not in doc:
                document = self.fetch_content(source, loader_kwargs)

                if not document or not document[0].page_content.strip():
                    self.logger.warning(f"Failed to fetch content for {source}")
                    documents.remove(doc)
                    continue

                doc['document'] = document

                links = self.extract_links(doc['document'][0].page_content)
                full_links = self.get_full_links(source, links)

                # Queue links that are not already tracked in documents or new_documents
                for link in full_links:
                    if not any(d.get('source', '') == link for d in documents) \
                            and not any(d.get('source', '') == link for d in new_documents):
                        new_documents.append({"source": link})

        documents.extend(new_documents)
        return documents

    def process_links(self, base_url: str, links: list, loader_kwargs,
                      depth: int, current_depth: int = 1) -> dict:
        """Recursively fetches the given links, and the links found inside them,
        until the maximum depth is reached."""
        content_dict = {}
        for idx, link in enumerate(links, start=1):
            full_link = link if link.startswith("http") else urljoin(base_url, link)
            self.logger.info(f"Processing link {idx}: {full_link}")
            link_content = self.fetch_content(full_link, loader_kwargs)

            if not link_content or not link_content[0].page_content.strip():
                self.logger.warning(f"Failed to fetch content for {full_link}")
                continue

            content_dict[full_link] = link_content[0].page_content

            if current_depth < depth:
                new_links = self.extract_links(content_dict[full_link])
                content_dict.update(self.process_links(full_link, new_links, loader_kwargs,
                                                       depth, current_depth + 1))
        return content_dict
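For reference, the link handling in extract_links and get_full_links reduces to BeautifulSoup href extraction plus urljoin resolution, with absolute links skipped when only_inside_links is set. The following self-contained sketch reproduces that behaviour on made-up sample HTML (the URLs are illustrative only):

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Illustrative sample page and base URL (not taken from a real crawl)
html = """
<html><body>
  <a href="/projects/alpha">Alpha</a>
  <a href="https://example.org/external">External</a>
  <a href="about.html">About</a>
</body></html>
"""
base_url = "https://perinim.github.io/projects/"
only_inside_links = True

# extract_links: collect every href attribute
links = [a["href"] for a in BeautifulSoup(html, "html.parser").find_all("a", href=True)]

# get_full_links: skip absolute links when only_inside_links is set,
# and resolve relative ones against the base URL
full_links = []
for link in links:
    if only_inside_links and link.startswith("http"):
        continue
    full_links.append(link if link.startswith("http") else urljoin(base_url, link))

print(full_links)
# ['https://perinim.github.io/projects/alpha', 'https://perinim.github.io/projects/about.html']
```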