diff --git a/README.md b/README.md
index cf437203..5d79bf55 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
- More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
-
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+ This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+ ```bash
+ pip install scrapegraphai[other-language-models]
```
- Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz.
@@ -55,23 +54,9 @@ pip install scrapegraphai[other-language-models]
pip install scrapegraphai[more-browser-options]
```
-- faiss Options: this group includes faiss integration
-
- ```bash
- pip install scrapegraphai[faiss-cpu]
- ```
-
-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
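For instance, a minimal `SmartScraperGraph` run looks like the sketch below (a sketch only, assuming an OpenAI key; any of the providers listed above can be swapped in via `graph_config`):

```python
import json
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "api_key": "YOUR_OPENAI_API_KEY",  # assumption: replace with your own key
        "model": "openai/gpt-4o-mini",
    },
    "verbose": True,
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))
```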
diff --git a/examples/anthropic/csv_scraper_haiku.py b/examples/anthropic/csv_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_haiku.py
rename to examples/anthropic/csv_scraper_anthropic.py
diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_graph_multi_haiku.py
rename to examples/anthropic/csv_scraper_graph_multi_anthropic.py
diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/custom_graph_haiku.py
rename to examples/anthropic/custom_graph_anthropic.py
diff --git a/examples/anthropic/depth_search_graph_anthropic.py b/examples/anthropic/depth_search_graph_anthropic.py
new file mode 100644
index 00000000..8cac7bea
--- /dev/null
+++ b/examples/anthropic/depth_search_graph_anthropic.py
@@ -0,0 +1,28 @@
+"""
+depth_search_graph_anthropic example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": os.getenv("ANTHROPIC_API_KEY"),
+ "model": "openai/gpt-4o-mini",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/anthropic/json_scraper_haiku.py b/examples/anthropic/json_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_haiku.py
rename to examples/anthropic/json_scraper_anthropic.py
diff --git a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_multi_haiku.py
rename to examples/anthropic/json_scraper_multi_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_graph_haiku.py
rename to examples/anthropic/pdf_scraper_graph_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_multi_haiku.py
rename to examples/anthropic/pdf_scraper_multi_anthropic.py
diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_anthropic.py
similarity index 100%
rename from examples/anthropic/rate_limit_haiku.py
rename to examples/anthropic/rate_limit_anthropic.py
diff --git a/examples/anthropic/scrape_plain_text_haiku.py b/examples/anthropic/scrape_plain_text_anthropic.py
similarity index 100%
rename from examples/anthropic/scrape_plain_text_haiku.py
rename to examples/anthropic/scrape_plain_text_anthropic.py
diff --git a/examples/anthropic/script_generator_haiku.py b/examples/anthropic/script_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_generator_haiku.py
rename to examples/anthropic/script_generator_anthropic.py
diff --git a/examples/anthropic/script_multi_generator_haiku.py b/examples/anthropic/script_multi_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_multi_generator_haiku.py
rename to examples/anthropic/script_multi_generator_anthropic.py
diff --git a/examples/anthropic/search_graph_haiku.py b/examples/anthropic/search_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/search_graph_haiku.py
rename to examples/anthropic/search_graph_anthropic.py
diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_anthropic.py
similarity index 100%
rename from examples/anthropic/search_graph_schema_haiku.py
rename to examples/anthropic/search_graph_schema_anthropic.py
diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/search_link_graph_haiku.py
rename to examples/anthropic/search_link_graph_anthropic.py
diff --git a/examples/anthropic/smart_scraper_haiku.py b/examples/anthropic/smart_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_haiku.py
rename to examples/anthropic/smart_scraper_anthropic.py
diff --git a/examples/anthropic/smart_scraper_multi_haiku.py b/examples/anthropic/smart_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_multi_haiku.py
rename to examples/anthropic/smart_scraper_multi_anthropic.py
diff --git a/examples/anthropic/smart_scraper_multi_concat_haiku.py b/examples/anthropic/smart_scraper_multi_concat_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_multi_concat_haiku.py
rename to examples/anthropic/smart_scraper_multi_concat_anthropic.py
diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_schema_haiku.py
rename to examples/anthropic/smart_scraper_schema_anthropic.py
diff --git a/examples/anthropic/xml_scraper_haiku.py b/examples/anthropic/xml_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/xml_scraper_haiku.py
rename to examples/anthropic/xml_scraper_anthropic.py
diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/xml_scraper_graph_multi_haiku.py
rename to examples/anthropic/xml_scraper_graph_multi_anthropic.py
diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py
index ad48933f..4bad1b0d 100644
--- a/examples/azure/code_generator_graph_azure.py
+++ b/examples/azure/code_generator_graph_azure.py
@@ -28,7 +28,7 @@ class Projects(BaseModel):
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False,
diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py
index efc99758..272527b3 100644
--- a/examples/azure/csv_scraper_azure.py
+++ b/examples/azure/csv_scraper_azure.py
@@ -25,7 +25,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/csv_scraper_graph_multi_azure.py b/examples/azure/csv_scraper_graph_multi_azure.py
index d9160c40..cccbf88e 100644
--- a/examples/azure/csv_scraper_graph_multi_azure.py
+++ b/examples/azure/csv_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/depth_search_graph_azure.py b/examples/azure/depth_search_graph_azure.py
new file mode 100644
index 00000000..88b2cd1b
--- /dev/null
+++ b/examples/azure/depth_search_graph_azure.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_azure example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": os.environ["AZURE_OPENAI_KEY"],
+ "model": "azure_openai/gpt-4o",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py
index 483544fe..5ba54f7b 100644
--- a/examples/azure/json_scraper_azure.py
+++ b/examples/azure/json_scraper_azure.py
@@ -23,7 +23,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py
index ecf97280..befc4e84 100644
--- a/examples/azure/json_scraper_multi_azure.py
+++ b/examples/azure/json_scraper_multi_azure.py
@@ -12,7 +12,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/pdf_scraper_azure.py b/examples/azure/pdf_scraper_azure.py
index f8926489..02b3b7e6 100644
--- a/examples/azure/pdf_scraper_azure.py
+++ b/examples/azure/pdf_scraper_azure.py
@@ -10,7 +10,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py
index cfd05f1f..892996c7 100644
--- a/examples/azure/rate_limit_azure.py
+++ b/examples/azure/rate_limit_azure.py
@@ -26,7 +26,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o",
"rate_limit": {
"requests_per_second": 1
},
diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py
index ef0d7d1c..9ea18d07 100644
--- a/examples/azure/scrape_plain_text_azure.py
+++ b/examples/azure/scrape_plain_text_azure.py
@@ -28,7 +28,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py
index 12f5d6be..b2bbb220 100644
--- a/examples/azure/script_generator_azure.py
+++ b/examples/azure/script_generator_azure.py
@@ -15,7 +15,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py
index a1bb8dbd..8c52cb95 100644
--- a/examples/azure/script_multi_generator_azure.py
+++ b/examples/azure/script_multi_generator_azure.py
@@ -16,7 +16,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py
index 13547e06..949f134c 100644
--- a/examples/azure/search_graph_azure.py
+++ b/examples/azure/search_graph_azure.py
@@ -22,7 +22,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py
index 629c92ab..e8c10093 100644
--- a/examples/azure/search_graph_schema_azure.py
+++ b/examples/azure/search_graph_schema_azure.py
@@ -30,7 +30,7 @@ class Dishes(BaseModel):
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py
index aec2297b..42ed07ad 100644
--- a/examples/azure/search_link_graph_azure.py
+++ b/examples/azure/search_link_graph_azure.py
@@ -15,7 +15,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py
index bf3bc8d7..933dc5b0 100644
--- a/examples/azure/smart_scraper_azure.py
+++ b/examples/azure/smart_scraper_azure.py
@@ -26,7 +26,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py
index f1f3451e..e066eaf1 100644
--- a/examples/azure/smart_scraper_multi_azure.py
+++ b/examples/azure/smart_scraper_multi_azure.py
@@ -14,7 +14,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py
index e3870a4c..06d08b9a 100644
--- a/examples/azure/smart_scraper_multi_concat_azure.py
+++ b/examples/azure/smart_scraper_multi_concat_azure.py
@@ -15,7 +15,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py
index d0816bf5..d2766ecb 100644
--- a/examples/azure/smart_scraper_schema_azure.py
+++ b/examples/azure/smart_scraper_schema_azure.py
@@ -29,7 +29,7 @@ class Projects(BaseModel):
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py
index ecfb8743..1c40f3e7 100644
--- a/examples/azure/xml_scraper_azure.py
+++ b/examples/azure/xml_scraper_azure.py
@@ -24,7 +24,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o"
},
"verbose": True,
"headless": False
diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py
index db4db108..972eb823 100644
--- a/examples/azure/xml_scraper_graph_multi_azure.py
+++ b/examples/azure/xml_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
graph_config = {
"llm": {
"api_key": os.environ["AZURE_OPENAI_KEY"],
- "model": "azure_openai/gpt-3.5-turbo",
+ "model": "azure_openai/gpt-4o",
},
"verbose": True,
"headless": False
diff --git a/examples/bedrock/depth_search_graph_bedrock.py b/examples/bedrock/depth_search_graph_bedrock.py
new file mode 100644
index 00000000..2ab88291
--- /dev/null
+++ b/examples/bedrock/depth_search_graph_bedrock.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_bedrock example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "client": "client_name",
+ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+ "temperature": 0.0
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/deepseek/depth_search_graph_deepseek.py b/examples/deepseek/depth_search_graph_deepseek.py
new file mode 100644
index 00000000..064690a5
--- /dev/null
+++ b/examples/deepseek/depth_search_graph_deepseek.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_deepseek example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+deepseek_key = os.getenv("DEEPSEEK_APIKEY")
+
+graph_config = {
+ "llm": {
+ "model": "deepseek/deepseek-chat",
+ "api_key": deepseek_key,
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py
index 57d422e5..a3082cf7 100644
--- a/examples/ernie/custom_graph_ernie.py
+++ b/examples/ernie/custom_graph_ernie.py
@@ -14,7 +14,7 @@
# Define the configuration for the graph
# ************************************************
-graph_config = {
+graph_config = {
"llm": {
"model": "ernie/ernie-bot-turbo",
"ernie_client_id": "",
diff --git a/examples/ernie/depth_search_graph_ernie.py b/examples/ernie/depth_search_graph_ernie.py
new file mode 100644
index 00000000..99470d8d
--- /dev/null
+++ b/examples/ernie/depth_search_graph_ernie.py
@@ -0,0 +1,26 @@
+"""
+depth_search_graph_ernie example
+"""
+from scrapegraphai.graphs import DepthSearchGraph
+
+graph_config = {
+ "llm": {
+ "model": "ernie/ernie-bot-turbo",
+ "ernie_client_id": "",
+ "ernie_client_secret": "",
+ "temperature": 0.1
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/fireworks/depth_search_graph_fireworks.py b/examples/fireworks/depth_search_graph_fireworks.py
new file mode 100644
index 00000000..f467be9f
--- /dev/null
+++ b/examples/fireworks/depth_search_graph_fireworks.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_fireworks example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": fireworks_api_key,
+ "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/google_genai/depth_search_graph_gemini.py b/examples/google_genai/depth_search_graph_gemini.py
new file mode 100644
index 00000000..956341f4
--- /dev/null
+++ b/examples/google_genai/depth_search_graph_gemini.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_gemini example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": gemini_key,
+ "model": "google_genai/gemini-pro",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/google_vertexai/depth_search_graph_gemini.py b/examples/google_vertexai/depth_search_graph_gemini.py
new file mode 100644
index 00000000..13bba630
--- /dev/null
+++ b/examples/google_vertexai/depth_search_graph_gemini.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_gemini (Vertex AI) example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": gemini_key,
+ "model": "google_vertexai/gemini-1.5-pro",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/groq/depth_search_graph_groq.py b/examples/groq/depth_search_graph_groq.py
new file mode 100644
index 00000000..2d1ed8b1
--- /dev/null
+++ b/examples/groq/depth_search_graph_groq.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_groq example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+ "llm": {
+ "model": "groq/gemma-7b-it",
+ "api_key": groq_key,
+ "temperature": 0
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py
index cec007b7..06b2f089 100644
--- a/examples/huggingfacehub/custom_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py
@@ -4,7 +4,6 @@
import os
from dotenv import load_dotenv
-
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from scrapegraphai.graphs import BaseGraph
diff --git a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py
new file mode 100644
index 00000000..48df3e37
--- /dev/null
+++ b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py
@@ -0,0 +1,38 @@
+"""
+depth_search_graph_huggingfacehub example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+load_dotenv()
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+ repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+ api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+graph_config = {
+ "llm": {"model_instance": llm_model_instance},
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/local_models/depth_search_graph_ollama.py b/examples/local_models/depth_search_graph_ollama.py
new file mode 100644
index 00000000..d0f960b5
--- /dev/null
+++ b/examples/local_models/depth_search_graph_ollama.py
@@ -0,0 +1,32 @@
+"""
+depth_search_graph_ollama example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "model": "ollama/llama3.1",
+ "temperature": 0,
+ "format": "json", # Ollama needs the format to be specified explicitly
+ # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py
index 6e9c3da3..e80bf5ec 100644
--- a/examples/local_models/json_scraper_multi_ollama.py
+++ b/examples/local_models/json_scraper_multi_ollama.py
@@ -15,6 +15,7 @@
"verbose": True,
"headless": False,
}
+
FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py
index 35503bd7..5a5b3cea 100644
--- a/examples/local_models/smart_scraper_schema_ollama.py
+++ b/examples/local_models/smart_scraper_schema_ollama.py
@@ -24,7 +24,6 @@ class Projects(BaseModel):
"format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
-
"verbose": True,
"headless": False
}
diff --git a/examples/mistral/depth_search_graph_mistral.py b/examples/mistral/depth_search_graph_mistral.py
new file mode 100644
index 00000000..ae18ffba
--- /dev/null
+++ b/examples/mistral/depth_search_graph_mistral.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_mistral example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+mistral_key = os.getenv("MISTRAL_API_KEY")
+
+graph_config = {
+ "llm": {
+ "api_key": mistral_key,
+ "model": "mistralai/open-mistral-nemo",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/nemotron/depth_search_graph_nemotron.py b/examples/nemotron/depth_search_graph_nemotron.py
new file mode 100644
index 00000000..edd80463
--- /dev/null
+++ b/examples/nemotron/depth_search_graph_nemotron.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_nemotron example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": os.getenv("NEMOTRON_KEY"),
+ "model": "claude-3-haiku-20240307",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/oneapi/depth_search_graph_oneapi.py b/examples/oneapi/depth_search_graph_oneapi.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/oneapi/depth_search_graph_oneapi.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_oneapi example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+ "llm": {
+ "api_key": "***************************",
+ "model": "oneapi/qwen-turbo",
+ "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/openai/depth_search_graph_openai.py b/examples/openai/depth_search_graph_openai.py
new file mode 100644
index 00000000..dff07ad4
--- /dev/null
+++ b/examples/openai/depth_search_graph_openai.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_openai example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": openai_key,
+ "model": "openai/gpt-4o-mini",
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py
new file mode 100644
index 00000000..7a2e7f3e
--- /dev/null
+++ b/examples/together/depth_search_graph_together.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_together example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+ "llm": {
+ "api_key": "***************************",
+ "model": "oneapi/qwen-turbo",
+ "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL
+ },
+ "verbose": True,
+ "headless": False,
+ "depth": 2,
+ "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+ prompt="List me all the projects with their description",
+ source="https://perinim.github.io",
+ config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/pyproject.toml b/pyproject.toml
index 26b1fdb7..4c5e5117 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,9 @@ dependencies = [
"google>=3.0.0",
"langchain-ollama>=0.1.3",
"semchunk==2.2.0",
- "transformers==4.44.2"
+ "transformers==4.44.2",
+ "qdrant-client>=1.11.3",
+ "fastembed>=0.3.6"
]
license = "MIT"
@@ -99,11 +101,6 @@ screenshot_scraper = [
"pillow>=10.4.0",
]
-# Group 5: Faiss CPU
-faiss-cpu = [
- "faiss-cpu>=1.8.0",
-]
-
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 1d9d469a..3423cef0 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -64,6 +64,8 @@ click==8.1.7
# via burr
# via streamlit
# via uvicorn
+coloredlogs==15.0.1
+ # via onnxruntime
contourpy==1.2.1
# via matplotlib
cycler==0.12.1
@@ -84,9 +86,13 @@ fastapi==0.112.0
# via burr
fastapi-pagination==0.12.26
# via burr
+fastembed==0.3.6
+ # via scrapegraphai
filelock==3.15.4
# via huggingface-hub
# via transformers
+flatbuffers==24.3.25
+ # via onnxruntime
fonttools==4.53.1
# via matplotlib
free-proxy==1.1.1
@@ -132,11 +138,19 @@ greenlet==3.0.3
grpcio==1.65.4
# via google-api-core
# via grpcio-status
+ # via grpcio-tools
+ # via qdrant-client
grpcio-status==1.62.3
# via google-api-core
+grpcio-tools==1.62.3
+ # via qdrant-client
h11==0.14.0
# via httpcore
# via uvicorn
+h2==4.1.0
+ # via httpx
+hpack==4.0.0
+ # via h2
html2text==2024.2.26
# via scrapegraphai
httpcore==1.0.5
@@ -149,11 +163,17 @@ httpx==0.27.0
# via langsmith
# via ollama
# via openai
+ # via qdrant-client
httpx-sse==0.4.0
# via langchain-mistralai
huggingface-hub==0.24.5
+ # via fastembed
# via tokenizers
# via transformers
+humanfriendly==10.0
+ # via coloredlogs
+hyperframe==6.0.1
+ # via h2
idna==3.7
# via anyio
# via httpx
@@ -218,6 +238,7 @@ langsmith==0.1.121
# via langchain-core
loguru==0.7.2
# via burr
+ # via fastembed
lxml==5.3.0
# via free-proxy
markdown-it-py==3.0.0
@@ -236,8 +257,12 @@ minify-html==0.15.0
# via scrapegraphai
mistral-common==1.4.1
# via scrapegraphai
+mmh3==4.1.0
+ # via fastembed
mpire==2.10.2
# via semchunk
+mpmath==1.3.0
+ # via sympy
multidict==6.0.5
# via aiohttp
# via yarl
@@ -249,19 +274,27 @@ narwhals==1.3.0
# via altair
numpy==1.26.4
# via contourpy
+ # via fastembed
# via langchain
# via langchain-aws
# via langchain-community
# via matplotlib
+ # via onnx
+ # via onnxruntime
# via opencv-python-headless
# via pandas
# via pyarrow
# via pydeck
+ # via qdrant-client
# via sf-hamilton
# via streamlit
# via transformers
ollama==0.3.2
# via langchain-ollama
+onnx==1.17.0
+ # via fastembed
+onnxruntime==1.19.2
+ # via fastembed
openai==1.40.3
# via burr
# via langchain-openai
@@ -275,6 +308,7 @@ packaging==24.1
# via langchain-core
# via marshmallow
# via matplotlib
+ # via onnxruntime
# via pytest
# via sphinx
# via streamlit
@@ -284,6 +318,7 @@ pandas==2.2.2
# via sf-hamilton
# via streamlit
pillow==10.4.0
+ # via fastembed
# via matplotlib
# via mistral-common
# via streamlit
@@ -294,6 +329,8 @@ playwright==1.45.1
# via undetected-playwright
pluggy==1.5.0
# via pytest
+portalocker==2.10.1
+ # via qdrant-client
proto-plus==1.24.0
# via google-ai-generativelanguage
# via google-api-core
@@ -303,6 +340,9 @@ protobuf==4.25.4
# via google-generativeai
# via googleapis-common-protos
# via grpcio-status
+ # via grpcio-tools
+ # via onnx
+ # via onnxruntime
# via proto-plus
# via streamlit
pyarrow==17.0.0
@@ -326,6 +366,7 @@ pydantic==2.8.2
# via mistral-common
# via openai
# via pydantic-settings
+ # via qdrant-client
pydantic-core==2.20.1
# via pydantic
pydantic-settings==2.5.2
@@ -343,6 +384,8 @@ pylint==3.2.6
pyparsing==3.1.2
# via httplib2
# via matplotlib
+pystemmer==2.2.0.1
+ # via fastembed
pytest==8.0.0
# via pytest-mock
pytest-mock==3.14.0
@@ -361,6 +404,8 @@ pyyaml==6.0.2
# via langchain-community
# via langchain-core
# via transformers
+qdrant-client==1.11.3
+ # via scrapegraphai
referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
@@ -369,6 +414,7 @@ regex==2024.7.24
# via transformers
requests==2.32.3
# via burr
+ # via fastembed
# via free-proxy
# via google-api-core
# via huggingface-hub
@@ -395,6 +441,8 @@ semchunk==2.2.0
# via scrapegraphai
sentencepiece==0.2.0
# via mistral-common
+setuptools==75.1.0
+ # via grpcio-tools
sf-hamilton==1.73.1
# via burr
six==1.16.0
@@ -406,6 +454,7 @@ sniffio==1.3.1
# via httpx
# via openai
snowballstemmer==2.2.0
+ # via fastembed
# via sphinx
soupsieve==2.5
# via beautifulsoup4
@@ -434,6 +483,8 @@ starlette==0.37.2
# via fastapi
streamlit==1.37.1
# via burr
+sympy==1.13.3
+ # via onnxruntime
tenacity==8.5.0
# via langchain
# via langchain-community
@@ -444,6 +495,7 @@ tiktoken==0.7.0
# via mistral-common
# via scrapegraphai
tokenizers==0.19.1
+ # via fastembed
# via langchain-mistralai
# via transformers
toml==0.10.2
@@ -456,6 +508,7 @@ tomlkit==0.13.0
tornado==6.4.1
# via streamlit
tqdm==4.66.5
+ # via fastembed
# via google-generativeai
# via huggingface-hub
# via mpire
@@ -495,6 +548,7 @@ uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.19
# via botocore
+ # via qdrant-client
# via requests
uvicorn==0.30.5
# via burr
diff --git a/requirements.lock b/requirements.lock
index 84e25a0f..8949648a 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -41,6 +41,8 @@ certifi==2024.7.4
# via requests
charset-normalizer==3.3.2
# via requests
+coloredlogs==15.0.1
+ # via onnxruntime
dataclasses-json==0.6.7
# via langchain-community
dill==0.3.8
@@ -49,9 +51,13 @@ distro==1.9.0
# via openai
exceptiongroup==1.2.2
# via anyio
+fastembed==0.3.6
+ # via scrapegraphai
filelock==3.15.4
# via huggingface-hub
# via transformers
+flatbuffers==24.3.25
+ # via onnxruntime
free-proxy==1.1.1
# via scrapegraphai
frozenlist==1.4.1
@@ -87,10 +93,18 @@ greenlet==3.0.3
grpcio==1.65.1
# via google-api-core
# via grpcio-status
+ # via grpcio-tools
+ # via qdrant-client
grpcio-status==1.62.2
# via google-api-core
+grpcio-tools==1.62.3
+ # via qdrant-client
h11==0.14.0
# via httpcore
+h2==4.1.0
+ # via httpx
+hpack==4.0.0
+ # via h2
html2text==2024.2.26
# via scrapegraphai
httpcore==1.0.5
@@ -103,11 +117,17 @@ httpx==0.27.0
# via langsmith
# via ollama
# via openai
+ # via qdrant-client
httpx-sse==0.4.0
# via langchain-mistralai
huggingface-hub==0.24.1
+ # via fastembed
# via tokenizers
# via transformers
+humanfriendly==10.0
+ # via coloredlogs
+hyperframe==6.0.1
+ # via h2
idna==3.7
# via anyio
# via httpx
@@ -156,6 +176,8 @@ langsmith==0.1.121
# via langchain
# via langchain-community
# via langchain-core
+loguru==0.7.2
+ # via fastembed
lxml==5.2.2
# via free-proxy
marshmallow==3.21.3
@@ -164,8 +186,12 @@ minify-html==0.15.0
# via scrapegraphai
mistral-common==1.4.1
# via scrapegraphai
+mmh3==4.1.0
+ # via fastembed
mpire==2.10.2
# via semchunk
+mpmath==1.3.0
+ # via sympy
multidict==6.0.5
# via aiohttp
# via yarl
@@ -174,14 +200,22 @@ multiprocess==0.70.16
mypy-extensions==1.0.0
# via typing-inspect
numpy==1.26.4
+ # via fastembed
# via langchain
# via langchain-aws
# via langchain-community
+ # via onnx
+ # via onnxruntime
# via opencv-python-headless
# via pandas
+ # via qdrant-client
# via transformers
ollama==0.3.2
# via langchain-ollama
+onnx==1.17.0
+ # via fastembed
+onnxruntime==1.19.2
+ # via fastembed
openai==1.41.0
# via langchain-openai
opencv-python-headless==4.10.0.84
@@ -192,14 +226,18 @@ packaging==24.1
# via huggingface-hub
# via langchain-core
# via marshmallow
+ # via onnxruntime
# via transformers
pandas==2.2.2
# via scrapegraphai
pillow==10.4.0
+ # via fastembed
# via mistral-common
playwright==1.45.1
# via scrapegraphai
# via undetected-playwright
+portalocker==2.10.1
+ # via qdrant-client
proto-plus==1.24.0
# via google-ai-generativelanguage
# via google-api-core
@@ -209,6 +247,9 @@ protobuf==4.25.3
# via google-generativeai
# via googleapis-common-protos
# via grpcio-status
+ # via grpcio-tools
+ # via onnx
+ # via onnxruntime
# via proto-plus
pyasn1==0.6.0
# via pyasn1-modules
@@ -226,6 +267,7 @@ pydantic==2.8.2
# via mistral-common
# via openai
# via pydantic-settings
+ # via qdrant-client
pydantic-core==2.20.1
# via pydantic
pydantic-settings==2.5.2
@@ -236,6 +278,8 @@ pygments==2.18.0
# via mpire
pyparsing==3.1.2
# via httplib2
+pystemmer==2.2.0.1
+ # via fastembed
python-dateutil==2.9.0.post0
# via botocore
# via pandas
@@ -250,6 +294,8 @@ pyyaml==6.0.1
# via langchain-community
# via langchain-core
# via transformers
+qdrant-client==1.11.3
+ # via scrapegraphai
referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
@@ -257,6 +303,7 @@ regex==2024.5.15
# via tiktoken
# via transformers
requests==2.32.3
+ # via fastembed
# via free-proxy
# via google-api-core
# via huggingface-hub
@@ -279,17 +326,23 @@ semchunk==2.2.0
# via scrapegraphai
sentencepiece==0.2.0
# via mistral-common
+setuptools==75.1.0
+ # via grpcio-tools
six==1.16.0
# via python-dateutil
sniffio==1.3.1
# via anyio
# via httpx
# via openai
+snowballstemmer==2.2.0
+ # via fastembed
soupsieve==2.5
# via beautifulsoup4
sqlalchemy==2.0.31
# via langchain
# via langchain-community
+sympy==1.13.3
+ # via onnxruntime
tenacity==8.5.0
# via langchain
# via langchain-community
@@ -299,9 +352,11 @@ tiktoken==0.7.0
# via mistral-common
# via scrapegraphai
tokenizers==0.19.1
+ # via fastembed
# via langchain-mistralai
# via transformers
tqdm==4.66.4
+ # via fastembed
# via google-generativeai
# via huggingface-hub
# via mpire
@@ -333,6 +388,7 @@ uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.19
# via botocore
+ # via qdrant-client
# via requests
yarl==1.9.4
# via aiohttp
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index efd6bd7e..b5ffcc47 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -26,3 +26,4 @@
from .screenshot_scraper_graph import ScreenshotScraperGraph
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
from .code_generator_graph import CodeGeneratorGraph
+from .depth_search_graph import DepthSearchGraph
diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py
new file mode 100644
index 00000000..13b39129
--- /dev/null
+++ b/scrapegraphai/graphs/depth_search_graph.py
@@ -0,0 +1,151 @@
+"""
+DepthSearchGraph Module
+"""
+from typing import Optional
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..nodes import (
+ FetchNodeLevelK,
+ ParseNodeDepthK,
+ DescriptionNode,
+ RAGNode,
+ GenerateAnswerNodeKLevel
+)
+
+class DepthSearchGraph(AbstractGraph):
+ """
+    DepthSearchGraph is a scraping pipeline that fetches a source URL and its
+    hyperlinks recursively up to a configurable depth, summarizes the fetched
+    documents, stores them in a vector database, and generates an answer to
+    the user prompt from the retrieved chunks. It requires a user prompt and
+    a source URL.
+
+ Attributes:
+ prompt (str): The prompt for the graph.
+ source (str): The source of the graph.
+ config (dict): Configuration parameters for the graph.
+ schema (BaseModel): The schema for the graph output.
+ llm_model: An instance of a language model client, configured for generating answers.
+ embedder_model: An instance of an embedding model client,
+ configured for generating embeddings.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+ headless (bool): A flag indicating whether to run the graph in headless mode.
+
+ Args:
+ prompt (str): The prompt for the graph.
+ source (str): The source of the graph.
+ config (dict): Configuration parameters for the graph.
+ schema (BaseModel): The schema for the graph output.
+
+ Example:
+ >>> code_gen = CodeGeneratorGraph(
+ ... "List me all the attractions in Chioggia.",
+ ... "https://en.wikipedia.org/wiki/Chioggia",
+ ... {"llm": {"model": "openai/gpt-3.5-turbo"}}
+ ... )
+ >>> result = code_gen.run()
+ )
+ """
+
+ def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+ super().__init__(prompt, config, source, schema)
+
+ self.input_key = "url" if source.startswith("http") else "local_dir"
+
+ def _create_graph(self) -> BaseGraph:
+ """
+ Creates the graph of nodes representing the workflow for web scraping.
+
+ Returns:
+ BaseGraph: A graph instance representing the web scraping workflow.
+ """
+
+ fetch_node_k = FetchNodeLevelK(
+ input="url| local_dir",
+ output=["docs"],
+ node_config={
+ "loader_kwargs": self.config.get("loader_kwargs", {}),
+ "force": self.config.get("force", False),
+ "cut": self.config.get("cut", True),
+ "browser_base": self.config.get("browser_base"),
+ "depth": self.config.get("depth", 1),
+ "only_inside_links": self.config.get("only_inside_links", False)
+ }
+ )
+
+ parse_node_k = ParseNodeDepthK(
+ input="docs",
+ output=["docs"],
+ node_config={
+ "verbose": self.config.get("verbose", False)
+ }
+ )
+
+ description_node = DescriptionNode(
+ input="docs",
+ output=["docs"],
+ node_config={
+ "llm_model": self.llm_model,
+ "verbose": self.config.get("verbose", False),
+ "cache_path": self.config.get("cache_path", False)
+ }
+ )
+
+        rag_node = RAGNode(
+ input="docs",
+ output=["vectorial_db"],
+ node_config={
+ "llm_model": self.llm_model,
+ "embedder_model": self.config.get("embedder_model", False),
+ "verbose": self.config.get("verbose", False),
+ }
+ )
+
+ generate_answer_k = GenerateAnswerNodeKLevel(
+ input="vectorial_db",
+ output=["answer"],
+ node_config={
+ "llm_model": self.llm_model,
+ "embedder_model": self.config.get("embedder_model", False),
+ "verbose": self.config.get("verbose", False),
+ }
+        )
+
+ return BaseGraph(
+ nodes=[
+ fetch_node_k,
+ parse_node_k,
+ description_node,
+ rag_node,
+ generate_answer_k
+ ],
+ edges=[
+ (fetch_node_k, parse_node_k),
+ (parse_node_k, description_node),
+ (description_node, rag_node),
+ (rag_node, generate_answer_k)
+ ],
+ entry_point=fetch_node_k,
+ graph_name=self.__class__.__name__
+ )
+
+ def run(self) -> str:
+ """
+        Executes the depth-first scraping process and returns the generated answer.
+
+        Returns:
+            str: The answer to the user prompt.
+ """
+
+ inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+ self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        answer = self.final_state.get("answer", "No answer")
+
+        return answer
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index ec16c48e..edb195a5 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -28,3 +28,7 @@
from .generate_code_node import GenerateCodeNode
from .search_node_with_context import SearchLinksWithContext
from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodeLevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode
+from .parse_node_depth_k import ParseNodeDepthK
diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..4201a61d
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,67 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from tqdm import tqdm
+from langchain.prompts import PromptTemplate
+from langchain_core.runnables import RunnableParallel
+from .base_node import BaseNode
+from ..prompts.description_node_prompts import DESCRIPTION_NODE_PROMPT
+
+class DescriptionNode(BaseNode):
+ """
+    A node responsible for generating a short description (summary) of each
+    fetched document and storing it in the state alongside the document, so
+    that later retrieval stages can match the user prompt against the summaries.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "DESCRIPTION".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "DESCRIPTION",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+ self.llm_model = node_config["llm_model"]
+ self.verbose = (
+ False if node_config is None else node_config.get("verbose", False)
+ )
+ self.cache_path = node_config.get("cache_path", False)
+
+ def execute(self, state: dict) -> dict:
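+        """
+        Generates a summary for each document in the state and stores it under
+        the document's "summary" key.
+        """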
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        docs = list(state.get("docs"))
+
+ chains_dict = {}
+
+ for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)):
+ prompt = PromptTemplate(
+ template=DESCRIPTION_NODE_PROMPT,
+ partial_variables={"content": chunk.get("document")}
+ )
+ chain_name = f"chunk{i+1}"
+ chains_dict[chain_name] = prompt | self.llm_model
+
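+        # Run all description chains in parallel: one LLM call per fetched document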
+ async_runner = RunnableParallel(**chains_dict)
+ batch_results = async_runner.invoke({})
+
+ for i in range(1, len(docs)+1):
+ docs[i-1]["summary"] = batch_results.get(f"chunk{i}").content
+
+ state.update({self.output[0]: docs})
+
+ return state
diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
new file mode 100644
index 00000000..d321b33c
--- /dev/null
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -0,0 +1,214 @@
+"""
+fetch_node_level_k module
+"""
+from typing import List, Optional
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+from langchain_core.documents import Document
+from .base_node import BaseNode
+from ..docloaders import ChromiumLoader
+
+class FetchNodeLevelK(BaseNode):
+ """
+ A node responsible for fetching the HTML content of a specified URL and all its sub-links
+ recursively up to a certain level of hyperlink the graph. This content is then used to update
+ the graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously
+ (with proxy protection).
+
+ Attributes:
+ embedder_model: An optional model for embedding the fetched content.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+ cache_path (str): Path to cache fetched content.
+ headless (bool): Whether to run the Chromium browser in headless mode.
+ loader_kwargs (dict): Additional arguments for the content loader.
+ browser_base (dict): Optional configuration for the browser base API.
+ depth (int): Maximum depth of hyperlink graph traversal.
+ only_inside_links (bool): Whether to fetch only internal links.
+ min_input_len (int): Minimum required length of input data.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "FetchLevelK",
+ ):
+ """
+ Initializes the FetchNodeLevelK instance.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (Optional[dict]): Additional configuration for the node.
+ node_name (str): The name of the node (default is "FetchLevelK").
+ """
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = node_config.get("verbose", False) if node_config else False
+ self.cache_path = node_config.get("cache_path", False)
+ self.headless = node_config.get("headless", True) if node_config else True
+ self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
+ self.browser_base = node_config.get("browser_base", None)
+ self.depth = node_config.get("depth", 1) if node_config else 1
+ self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
+ self.min_input_len = 1
+
+ def execute(self, state: dict) -> dict:
+ """
+ Executes the node's logic to fetch the HTML content of a specified URL and its sub-links
+ recursively, then updates the graph's state with the fetched content.
+
+ Args:
+ state (dict): The current state of the graph.
+
+ Returns:
+ dict: The updated state with a new output key containing the fetched HTML content.
+
+ Raises:
+ KeyError: If the input key is not found in the state.
+ """
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ input_keys = self.get_input_keys(state)
+ input_data = [state[key] for key in input_keys]
+ source = input_data[0]
+
+ documents = [{"source": source}]
+ loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {}
+
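+        # Each iteration expands the crawl frontier by one hop, up to self.depth levels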
+ for _ in range(self.depth):
+ documents = self.obtain_content(documents, loader_kwargs)
+
+ filtered_documents = [doc for doc in documents if 'document' in doc]
+ state.update({self.output[0]: filtered_documents})
+ return state
+
+    def fetch_content(self, source: str, loader_kwargs) -> List[Document]:
+ """
+ Fetches the HTML content of a given source URL.
+
+ Args:
+ source (str): The URL to fetch content from.
+ loader_kwargs (dict): Additional arguments for the content loader.
+
+ Returns:
+            List[Document]: The fetched documents; may be empty if fetching failed.
+ """
+ self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+
+ if self.browser_base is not None:
+ try:
+ from ..docloaders.browser_base import browser_base_fetch
+ except ImportError:
+ raise ImportError("""The browserbase module is not installed.
+ Please install it using `pip install browserbase`.""")
+
+ data = browser_base_fetch(self.browser_base.get("api_key"),
+ self.browser_base.get("project_id"), [source])
+ document = [Document(page_content=content, metadata={"source": source}) for content in data]
+ else:
+ loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
+ document = loader.load()
+ return document
+
+ def extract_links(self, html_content: str) -> list:
+ """
+ Extracts all hyperlinks from the HTML content.
+
+ Args:
+ html_content (str): The HTML content to extract links from.
+
+ Returns:
+ list: A list of extracted hyperlinks.
+ """
+ soup = BeautifulSoup(html_content, 'html.parser')
+ links = [link['href'] for link in soup.find_all('a', href=True)]
+ self.logger.info(f"Extracted {len(links)} links.")
+ return links
+
+ def get_full_links(self, base_url: str, links: list) -> list:
+ """
+ Converts relative URLs to full URLs based on the base URL.
+
+ Args:
+ base_url (str): The base URL for resolving relative links.
+ links (list): A list of links to convert.
+
+ Returns:
+ list: A list of full URLs.
+ """
+ full_links = []
+ for link in links:
+ if self.only_inside_links and link.startswith("http"):
+ continue
+ full_link = link if link.startswith("http") else urljoin(base_url, link)
+ full_links.append(full_link)
+ return full_links
+
+ def obtain_content(self, documents: List, loader_kwargs) -> List:
+ """
+ Iterates through documents, fetching and updating content recursively.
+
+ Args:
+ documents (List): A list of documents containing the source URLs.
+ loader_kwargs (dict): Additional arguments for the content loader.
+
+ Returns:
+ List: The updated list of documents with fetched content.
+ """
+ new_documents = []
+        for doc in list(documents):  # iterate over a copy: failed sources are removed from the original list
+ source = doc['source']
+ if 'document' not in doc:
+ document = self.fetch_content(source, loader_kwargs)
+
+ if not document or not document[0].page_content.strip():
+ self.logger.warning(f"Failed to fetch content for {source}")
+ documents.remove(doc)
+ continue
+
+ doc['document'] = document
+ links = self.extract_links(doc['document'][0].page_content)
+ full_links = self.get_full_links(source, links)
+
+ for link in full_links:
+                if (not any(d.get('source', '') == link for d in documents)
+                        and not any(d.get('source', '') == link for d in new_documents)):
+ new_documents.append({"source": link})
+
+ documents.extend(new_documents)
+ return documents
+
+ def process_links(self, base_url: str, links: list,
+ loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+ """
+ Processes a list of links recursively up to a given depth.
+
+ Args:
+ base_url (str): The base URL for resolving relative links.
+ links (list): A list of links to process.
+ loader_kwargs (dict): Additional arguments for the content loader.
+ depth (int): The maximum depth for recursion.
+ current_depth (int): The current depth of recursion (default is 1).
+
+ Returns:
+ dict: A dictionary containing processed link content.
+ """
+ content_dict = {}
+ for idx, link in enumerate(links, start=1):
+ full_link = link if link.startswith("http") else urljoin(base_url, link)
+ self.logger.info(f"Processing link {idx}: {full_link}")
+            link_content = self.fetch_content(full_link, loader_kwargs)
+
+            if link_content and link_content[0].page_content.strip():
+                content_dict[full_link] = link_content
+                if current_depth < depth:
+                    new_links = self.extract_links(link_content[0].page_content)
+                    content_dict.update(self.process_links(full_link, new_links,
+                                                           loader_kwargs, depth, current_depth + 1))
+            else:
+                self.logger.warning(f"Failed to fetch content for {full_link}")
+ return content_dict
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 15686ec1..d5034a1e 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -1,3 +1,6 @@
+"""
+generate_answer_node module
+"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
@@ -15,6 +18,26 @@
)
class GenerateAnswerNode(BaseNode):
+ """
+ Initializes the GenerateAnswerNode class.
+
+ Args:
+ input (str): The input data type for the node.
+ output (List[str]): The output data type(s) for the node.
+ node_config (Optional[dict]): Configuration dictionary for the node,
+ which includes the LLM model, verbosity, schema, and other settings.
+ Defaults to None.
+ node_name (str): The name of the node. Defaults to "GenerateAnswer".
+
+ Attributes:
+ llm_model: The language model specified in the node configuration.
+ verbose (bool): Whether verbose mode is enabled.
+ force (bool): Whether to force certain behaviors, overriding defaults.
+ script_creator (bool): Whether the node is in script creation mode.
+ is_md_scraper (bool): Whether the node is scraping markdown data.
+ additional_info (Optional[str]): Any additional information to be
+ included in the prompt templates.
+ """
def __init__(
self,
input: str,
@@ -100,7 +123,9 @@ def execute(self, state: dict) -> dict:
prompt = PromptTemplate(
template=template_chunks_prompt,
input_variables=["question"],
- partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}
+ partial_variables={"context": chunk,
+ "chunk_id": i + 1,
+ "format_instructions": format_instructions}
)
chain_name = f"chunk{i+1}"
chains_dict[chain_name] = prompt | self.llm_model
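
The reformatted hunk above pre-binds per-chunk values with `partial_variables`, so each chunk chain only needs the shared `question` at invoke time. A small sketch with a toy template (not the library's actual `TEMPLATE_CHUNKS`):

```python
from langchain.prompts import PromptTemplate

# Toy template: only "question" is left as a runtime input; the chunk-specific
# values are pre-bound via partial_variables, mirroring the hunk above.
prompt = PromptTemplate(
    template="Chunk {chunk_id}: {context}\n{format_instructions}\nQuestion: {question}",
    input_variables=["question"],
    partial_variables={"context": "<chunk text>", "chunk_id": 1,
                       "format_instructions": "Reply in JSON."},
)
print(prompt.format(question="List all the projects"))
```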
diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py
new file mode 100644
index 00000000..291109f2
--- /dev/null
+++ b/scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -0,0 +1,150 @@
+"""
+GenerateAnswerNodeKLevel Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from tqdm import tqdm
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_aws import ChatBedrock
+from ..utils.output_parser import get_structured_output_parser, get_pydantic_output_parser
+from .base_node import BaseNode
+from ..prompts import (
+ TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE,
+ TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
+)
+
+class GenerateAnswerNodeKLevel(BaseNode):
+ """
+    A node that generates an answer from the chunks retrieved from the vector
+    database: the most relevant chunks are queried, answered in parallel,
+    and merged into a single response.
+
+ It allows scraping of big documents without exceeding the token limit of the language model.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "Parse".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "GANLK",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.llm_model = node_config["llm_model"]
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = node_config.get("verbose", False)
+ self.force = node_config.get("force", False)
+ self.script_creator = node_config.get("script_creator", False)
+ self.is_md_scraper = node_config.get("is_md_scraper", False)
+ self.additional_info = node_config.get("additional_info")
+
+ def execute(self, state: dict) -> dict:
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ user_prompt = state.get("user_prompt")
+
+ if self.node_config.get("schema", None) is not None:
+ if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
+ self.llm_model = self.llm_model.with_structured_output(
+ schema=self.node_config["schema"]
+ )
+ output_parser = get_structured_output_parser(self.node_config["schema"])
+ format_instructions = "NA"
+ else:
+ if not isinstance(self.llm_model, ChatBedrock):
+ output_parser = get_pydantic_output_parser(self.node_config["schema"])
+ format_instructions = output_parser.get_format_instructions()
+ else:
+ output_parser = None
+ format_instructions = ""
+ else:
+ if not isinstance(self.llm_model, ChatBedrock):
+ output_parser = JsonOutputParser()
+ format_instructions = output_parser.get_format_instructions()
+ else:
+ output_parser = None
+ format_instructions = ""
+
+        if ((isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI))
+             and not self.script_creator)
+                or (self.force and not self.script_creator)
+                or self.is_md_scraper):
+ template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD
+ template_chunks_prompt = TEMPLATE_CHUNKS_MD
+ template_merge_prompt = TEMPLATE_MERGE_MD
+ else:
+ template_no_chunks_prompt = TEMPLATE_NO_CHUNKS
+ template_chunks_prompt = TEMPLATE_CHUNKS
+ template_merge_prompt = TEMPLATE_MERGE
+
+ if self.additional_info is not None:
+ template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
+ template_chunks_prompt = self.additional_info + template_chunks_prompt
+ template_merge_prompt = self.additional_info + template_merge_prompt
+
+ client = state["vectorial_db"]
+
+ if state.get("embeddings"):
+ import openai
+ openai_client = openai.Client()
+
+ answer_db = client.search(
+ collection_name="collection",
+ query_vector=openai_client.embeddings.create(
+ input=["What is the best to use for vector search scaling?"],
+ model=state.get("embeddings").get("model"),
+ )
+ .data[0]
+ .embedding,
+ )
+ else:
+ answer_db = client.query(
+ collection_name="vectorial_collection",
+ query_text=user_prompt
+ )
+
+ chains_dict = {}
+ elems =[state.get("docs")[elem.id-1] for elem in answer_db if elem.score>0.5]
+
+ for i, chunk in enumerate(tqdm(elems,
+ desc="Processing chunks", disable=not self.verbose)):
+ prompt = PromptTemplate(
+ template=template_chunks_prompt,
+ input_variables=["format_instructions"],
+ partial_variables={"context": chunk.get("document"),
+ "chunk_id": i + 1,
+ }
+ )
+ chain_name = f"chunk{i+1}"
+ chains_dict[chain_name] = prompt | self.llm_model
+
+ async_runner = RunnableParallel(**chains_dict)
+ batch_results = async_runner.invoke({"format_instructions": user_prompt})
+
+ merge_prompt = PromptTemplate(
+ template=template_merge_prompt,
+ input_variables=["context", "question"],
+ partial_variables={"format_instructions": format_instructions}
+ )
+
+ merge_chain = merge_prompt | self.llm_model
+ if output_parser:
+ merge_chain = merge_chain | output_parser
+ answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+
+ state["answer"] = answer
+
+ return state
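
`RunnableParallel` is what fans the same input out to every per-chunk chain and collects the results by name; the merge chain then consumes the combined dict. A self-contained sketch using `RunnableLambda` stand-ins for the real `prompt | llm` chains:

```python
from langchain_core.runnables import RunnableLambda, RunnableParallel

# Stand-ins for the per-chunk `prompt | llm` chains built in the loop above.
chains_dict = {
    f"chunk{i}": RunnableLambda(lambda x, i=i: f"chunk {i} answer to: {x['question']}")
    for i in range(1, 4)
}

# One invoke feeds the same input to every chain and returns {name: result}.
batch_results = RunnableParallel(**chains_dict).invoke(
    {"question": "What projects are listed?"}
)
print(batch_results)  # {'chunk1': ..., 'chunk2': ..., 'chunk3': ...}
```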
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index cc72aaf4..746b10a5 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -26,7 +26,6 @@
from .base_node import BaseNode
from jsonschema import validate, ValidationError
-
class GenerateCodeNode(BaseNode):
"""
A node that generates Python code for a function that extracts data
@@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict:
Raises:
KeyError: If the input keys are not found in the state, indicating
that the necessary information for generating an answer is missing.
- RuntimeError: If the maximum number of iterations is
+ RuntimeError: If the maximum number of iterations is
reached without obtaining the desired code.
"""
@@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict:
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
state = self.semantic_comparison_loop(state)
if state["errors"]["semantic"]:
- continue
+ continue
break
if state["iteration"] == self.max_iterations["overall"] and \
@@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict:
state["errors"]["syntax"] = [syntax_message]
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
analysis = syntax_focused_analysis(state, self.llm_model)
- self.logger.info(f"""--- (Regenerating Code
+ self.logger.info(f"""--- (Regenerating Code
to fix the Error) ---""")
- state["generated_code"] = syntax_focused_code_generation(state,
+ state["generated_code"] = syntax_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
@@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict:
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
analysis = execution_focused_analysis(state, self.llm_model)
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
- state["generated_code"] = execution_focused_code_generation(state,
+ state["generated_code"] = execution_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
def validation_reasoning_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["validation"]):
- validation, errors = self.validate_dict(state["execution_result"],
+ validation, errors = self.validate_dict(state["execution_result"],
self.output_schema.schema())
if validation:
state["errors"]["validation"] = []
@@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict:
def semantic_comparison_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["semantic"]):
- comparison_result = self.semantic_comparison(state["execution_result"],
+ comparison_result = self.semantic_comparison(state["execution_result"],
state["reference_answer"])
if comparison_result["are_semantically_equivalent"]:
state["errors"]["semantic"] = []
@@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code):
if not extract_data:
raise NameError("Function 'extract_data' not found in the generated code.")
- result = extract_data(self.raw_html)
+ result = extract_data(self.raw_html)
return True, result
except Exception as e:
return False, f"Error during execution: {str(e)}"
@@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema):
validate(instance=data, schema=schema)
return True, None
except ValidationError as e:
- errors = e.errors()
+ errors = [e.message]
return False, errors
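
The last hunk is a real fix: `jsonschema.ValidationError` exposes a single `.message` string, not a pydantic-style `.errors()` method. A quick illustration of the corrected pattern:

```python
from jsonschema import ValidationError, validate

schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}},
    "required": ["title"],
}

def validate_dict(data, schema):
    # jsonschema raises ValidationError; the human-readable text lives in .message
    try:
        validate(instance=data, schema=schema)
        return True, None
    except ValidationError as e:
        return False, [e.message]

print(validate_dict({"name": "x"}, schema))
# (False, ["'title' is a required property"])
```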
diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py
new file mode 100644
index 00000000..6427b051
--- /dev/null
+++ b/scrapegraphai/nodes/parse_node_depth_k.py
@@ -0,0 +1,67 @@
+"""
+ParseNodeDepthK Module
+"""
+from typing import List, Optional
+from langchain_community.document_transformers import Html2TextTransformer
+from .base_node import BaseNode
+
+class ParseNodeDepthK(BaseNode):
+ """
+ A node responsible for parsing HTML content from a series of documents.
+
+ This node enhances the scraping workflow by allowing for targeted extraction of
+ content, thereby optimizing the processing of large HTML documents.
+
+ Attributes:
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "Parse".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "ParseNodeDepthK",
+ ):
+ super().__init__(node_name, "node", input, output, 1, node_config)
+
+ self.verbose = (
+ False if node_config is None else node_config.get("verbose", False)
+ )
+
+ def execute(self, state: dict) -> dict:
+ """
+ Executes the node's logic to parse the HTML documents content.
+
+ Args:
+ state (dict): The current state of the graph. The input keys will be used to fetch the
+ correct data from the state.
+
+ Returns:
+ dict: The updated state with the output key containing the parsed content chunks.
+
+ Raises:
+ KeyError: If the input keys are not found in the state, indicating that the
+ necessary information for parsing the content is missing.
+ """
+
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ input_keys = self.get_input_keys(state)
+ input_data = [state[key] for key in input_keys]
+
+ documents = input_data[0]
+
+        transformer = Html2TextTransformer(ignore_links=True)
+        for doc in documents:
+            document_md = transformer.transform_documents(doc["document"])
+            doc["document"] = document_md[0].page_content
+
+ state.update({self.output[0]: documents})
+
+ return state
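
`Html2TextTransformer` does the heavy lifting here, converting each fetched HTML document into markdown-style text; `ignore_links=True` keeps anchor text but drops the link targets. A minimal usage sketch:

```python
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document

html_doc = Document(page_content=(
    "<html><body><h1>Projects</h1>"
    "<a href='/project-1'>Project 1</a></body></html>"
))

# ignore_links=True strips href targets, which is what ParseNodeDepthK
# stores back into each document dict before further parsing.
markdown_docs = Html2TextTransformer(ignore_links=True).transform_documents([html_doc])
print(markdown_docs[0].page_content)
```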
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
index 1174beee..b67c50e9 100644
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@@ -1,29 +1,10 @@
"""
RAGNode Module
"""
-import os
-import sys
from typing import List, Optional
-from langchain.docstore.document import Document
-from langchain.retrievers import ContextualCompressionRetriever
-from langchain.retrievers.document_compressors import (
- DocumentCompressorPipeline,
- EmbeddingsFilter,
-)
-from langchain_community.document_transformers import EmbeddingsRedundantFilter
-from langchain_community.vectorstores import FAISS
-from langchain_community.chat_models import ChatOllama
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_aws import BedrockEmbeddings, ChatBedrock
-from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
-from ..utils.logging import get_logger
from .base_node import BaseNode
-from ..helpers import models_tokens
-from ..models import DeepSeek
-
-optional_modules = {"langchain_anthropic", "langchain_fireworks",
- "langchain_groq", "langchain_google_vertexai"}
+from qdrant_client import QdrantClient
+from qdrant_client.models import PointStruct, VectorParams, Distance
class RAGNode(BaseNode):
"""
@@ -34,7 +15,6 @@ class RAGNode(BaseNode):
Attributes:
llm_model: An instance of a language model client, configured for generating answers.
- embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
Args:
@@ -58,125 +38,62 @@ def __init__(
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
- self.cache_path = node_config.get("cache_path", False)
def execute(self, state: dict) -> dict:
- # Execution logic
- pass
+ self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+ if self.node_config.get("client_type") in ["memory", None]:
+ client = QdrantClient(":memory:")
+ elif self.node_config.get("client_type") == "local_db":
+ client = QdrantClient(path="path/to/db")
+ elif self.node_config.get("client_type") == "image":
+ client = QdrantClient(url="http://localhost:6333")
+ else:
+ raise ValueError("client_type provided not correct")
+
+ docs = [elem.get("summary") for elem in state.get("docs")]
+ ids = [i for i in range(1, len(state.get("docs"))+1)]
+
+ if state.get("embeddings"):
+ import openai
+ openai_client = openai.Client()
+
+ files = state.get("documents")
+
+ array_of_embeddings = []
+            i = 0
- def _create_default_embedder(self, llm_config=None) -> object:
- """
- Create an embedding model instance based on the chosen llm model.
+ for file in files:
+ embeddings = openai_client.embeddings.create(input=file,
+ model=state.get("embeddings").get("model"))
+                i += 1
+ points = PointStruct(
+ id=i,
+                    vector=embeddings.data[0].embedding,
+ payload={"text": file},
+ )
- Returns:
- object: An instance of the embedding model client.
+ array_of_embeddings.append(points)
- Raises:
- ValueError: If the model is not supported.
- """
+ collection_name = "collection"
- if isinstance(self.llm_model, ChatGoogleGenerativeAI):
- return GoogleGenerativeAIEmbeddings(
- google_api_key=llm_config["api_key"], model="models/embedding-001"
+ client.create_collection(
+ collection_name,
+ vectors_config=VectorParams(
+ size=1536,
+ distance=Distance.COSINE,
+ ),
)
- if isinstance(self.llm_model, ChatOpenAI):
- return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
- base_url=self.llm_model.openai_api_base)
- elif isinstance(self.llm_model, DeepSeek):
- return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
- elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
- return self.llm_model
- elif isinstance(self.llm_model, AzureChatOpenAI):
- return AzureOpenAIEmbeddings()
- elif isinstance(self.llm_model, ChatOllama):
- params = self.llm_model._lc_kwargs
- params.pop("streaming", None)
- params.pop("temperature", None)
- return OllamaEmbeddings(**params)
- elif isinstance(self.llm_model, ChatBedrock):
- return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
- elif all(key in sys.modules for key in optional_modules):
- if isinstance(self.llm_model, ChatFireworks):
- from langchain_fireworks import FireworksEmbeddings
- return FireworksEmbeddings(model=self.llm_model.model_name)
- if isinstance(self.llm_model, ChatNVIDIA):
- from langchain_nvidia import NVIDIAEmbeddings
- return NVIDIAEmbeddings(model=self.llm_model.model_name)
- if isinstance(self.llm_model, ChatHuggingFace):
- from langchain_huggingface import HuggingFaceEmbeddings
- return HuggingFaceEmbeddings(model=self.llm_model.model)
- if isinstance(self.llm_model, ChatVertexAI):
- from langchain_vertexai import VertexAIEmbeddings
- return VertexAIEmbeddings()
- else:
- raise ValueError("Embedding Model missing or not supported")
-
- def _create_embedder(self, embedder_config: dict) -> object:
- """
- Create an embedding model instance based on the configuration provided.
-
- Args:
- embedder_config (dict): Configuration parameters for the embedding model.
-
- Returns:
- object: An instance of the embedding model client.
-
- Raises:
- KeyError: If the model is not supported.
- """
- embedder_params = {**embedder_config}
- if "model_instance" in embedder_config:
- return embedder_params["model_instance"]
- if "openai" in embedder_params["model"]:
- return OpenAIEmbeddings(api_key=embedder_params["api_key"])
- if "azure" in embedder_params["model"]:
- return AzureOpenAIEmbeddings()
- if "ollama" in embedder_params["model"]:
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["ollama"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return OllamaEmbeddings(**embedder_params)
- if "gemini" in embedder_params["model"]:
- try:
- models_tokens["gemini"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return GoogleGenerativeAIEmbeddings(model=embedder_params["model"])
- if "bedrock" in embedder_params["model"]:
- embedder_params["model"] = embedder_params["model"].split("/")[-1]
- client = embedder_params.get("client", None)
- try:
- models_tokens["bedrock"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return BedrockEmbeddings(client=client, model_id=embedder_params["model"])
- if all(key in sys.modules for key in optional_modules):
- if "hugging_face" in embedder_params["model"]:
- from langchain_huggingface import HuggingFaceEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["hugging_face"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return HuggingFaceEmbeddings(model=embedder_params["model"])
- elif "fireworks" in embedder_params["model"]:
- from langchain_fireworks import FireworksEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["fireworks"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return FireworksEmbeddings(model=embedder_params["model"])
- elif "nvidia" in embedder_params["model"]:
- from langchain_nvidia import NVIDIAEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["nvidia"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return NVIDIAEmbeddings(model=embedder_params["model"],
- nvidia_api_key=embedder_params["api_key"])
-
- raise ValueError("Model provided by the configuration not supported")
+        client.upsert(collection_name, array_of_embeddings)
+
+ state["vectorial_db"] = client
+ return state
+
+ client.add(
+ collection_name="vectorial_collection",
+ documents=docs,
+ ids=ids
+ )
+
+ state["vectorial_db"] = client
+ return state
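
The rewritten `RAGNode` delegates storage and retrieval to `qdrant-client`. On the no-embeddings path, `client.add` and `client.query` embed the text with Qdrant's bundled FastEmbed integration (available via `pip install qdrant-client[fastembed]`). A minimal in-memory sketch of that path:

```python
from qdrant_client import QdrantClient

# Mirrors the node's default "memory" client_type.
client = QdrantClient(":memory:")

# add() embeds the documents locally with FastEmbed before upserting them.
client.add(
    collection_name="vectorial_collection",
    documents=["Qdrant is a vector database",
               "ScrapeGraphAI scrapes websites with LLMs"],
    ids=[1, 2],
)

hits = client.query(collection_name="vectorial_collection",
                    query_text="vector search", limit=1)
print(hits[0].id, hits[0].score, hits[0].document)
```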
diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py
new file mode 100644
index 00000000..20df481a
--- /dev/null
+++ b/scrapegraphai/prompts/description_node_prompts.py
@@ -0,0 +1,10 @@
+"""
+description node prompts
+"""
+
+DESCRIPTION_NODE_PROMPT = """
+You are a scraper and you have just scraped the
+following content from a website. \n
+Please provide a summary description of maximum 20 words.
+Content of the website: {content}
+"""
\ No newline at end of file
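
Since `DESCRIPTION_NODE_PROMPT` is a plain format string, filling it is a one-liner; `page_text` below is a hypothetical variable:

```python
from scrapegraphai.prompts.description_node_prompts import DESCRIPTION_NODE_PROMPT

page_text = "ScrapeGraphAI is a Python library for scraping with LLM pipelines..."
prompt = DESCRIPTION_NODE_PROMPT.format(content=page_text)
```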
diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py
index 7c098fe2..1b336fb4 100644
--- a/scrapegraphai/prompts/generate_answer_node_prompts.py
+++ b/scrapegraphai/prompts/generate_answer_node_prompts.py
@@ -2,6 +2,7 @@
Generate answer node prompts
"""
+
TEMPLATE_CHUNKS_MD = """
You are a website scraper and you have just scraped the
following content from a website converted in markdown format.
@@ -32,6 +33,7 @@
You are now asked to answer a user question about the content you have scraped.\n
You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n
Make sure that if a maximum number of items is specified in the instructions that you get that maximum number and do not exceed it. \n
+Make sure the merged answer follows a single coherent structure. \n
Make sure the output format is a valid JSON and does not contain errors. \n
OUTPUT INSTRUCTIONS: {format_instructions}\n
USER QUESTION: {question}\n
diff --git a/scrapegraphai/utils/1_manual.py b/scrapegraphai/utils/1_manual.py
new file mode 100644
index 00000000..21703b7b
--- /dev/null
+++ b/scrapegraphai/utils/1_manual.py
@@ -0,0 +1,92 @@
+"""
+Manual depth-limited scraping example: fetches pages through the scrape.do
+rendering API, extracts links, and saves markdown content as JSON.
+"""
+import json
+import logging
+import os
+import time
+from typing import Optional
+from urllib.parse import quote, urljoin
+import markdownify
+import requests
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+load_dotenv()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def fetch_content(token: str, target_url: str, max_retries: int = 5, retry_delay: int = 3) -> Optional[str]:
+ encoded_url = quote(target_url)
+ url = f"http://api.scrape.do?url={encoded_url}&token={token}&render=true&waitUntil=networkidle0"
+
+ for attempt in range(max_retries):
+ try:
+            response = requests.get(url, timeout=60)
+ if response.status_code == 200:
+ logging.info(f"Successfully fetched content from {target_url}")
+ return response.text
+ logging.warning(f"Failed with status {response.status_code}. Retrying in {retry_delay}s...")
+ except requests.RequestException as e:
+ logging.error(f"Error fetching {target_url}: {e}. Retrying in {retry_delay}s...")
+ time.sleep(retry_delay)
+
+ logging.error(f"Failed to fetch {target_url} after {max_retries} attempts.")
+ return None
+
+def extract_links(html_content: str) -> list:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ links = [link['href'] for link in soup.find_all('a', href=True)]
+ logging.info(f"Extracted {len(links)} links.")
+ return links
+
+def process_links(token: str, base_url: str, links: list, depth: int, current_depth: int = 1) -> dict:
+ content_dict = {}
+ for idx, link in enumerate(links, start=1):
+ full_link = link if link.startswith("http") else urljoin(base_url, link)
+ logging.info(f"Processing link {idx}: {full_link}")
+ link_content = fetch_content(token, full_link)
+ if link_content:
+ markdown_content = markdownify.markdownify(link_content, heading_style="ATX")
+ content_dict[full_link] = markdown_content
+ save_content_to_json(content_dict, idx)
+
+ if current_depth < depth:
+ new_links = extract_links(link_content)
+ content_dict.update(process_links(token, full_link, new_links, depth, current_depth + 1))
+ else:
+ logging.warning(f"Failed to fetch content for {full_link}")
+ return content_dict
+
+def save_content_to_json(content_dict: dict, idx: int):
+ if not os.path.exists("downloaded_pages"):
+ os.makedirs("downloaded_pages")
+
+ file_name = f"scraped_content_{idx}.json"
+ file_path = os.path.join("downloaded_pages", file_name)
+
+ with open(file_path, "w", encoding="utf-8") as json_file:
+ json.dump(content_dict, json_file, ensure_ascii=False, indent=4)
+
+ logging.info(f"Content saved to {file_path}")
+
+if __name__ == "__main__":
+ token = os.getenv("TOKEN")
+ target_url = "https://www.wired.com"
+ depth = 2
+
+    if not token:
+        logging.error("Please set the TOKEN environment variable.")
+ exit(1)
+
+ html_content = fetch_content(token, target_url)
+
+ if html_content:
+ links = extract_links(html_content)
+ logging.info("Links found:")
+ for link in links:
+ logging.info(link)
+
+ content_dict = process_links(token, target_url, links, depth)
+ for link, content in content_dict.items():
+ logging.info(f"Link: {link}")
+ logging.info(f"Content: {content[:500]}...")
+ else:
+ logging.error("Failed to fetch the content.")