diff --git a/README.md b/README.md
index cf437203..5d79bf55 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
 
 - More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
 
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+  This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+  ```bash
+  pip install scrapegraphai[other-language-models]
   ```
 
 - Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz.
@@ -55,23 +54,9 @@ pip install scrapegraphai[other-language-models]
   pip install scrapegraphai[more-browser-options]
   ```
 
-- faiss Options: this group includes faiss integration
-
-  ```bash
-  pip install scrapegraphai[faiss-cpu]
-  ```
-
-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
 ## 💻 Usage
 
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
diff --git a/examples/anthropic/csv_scraper_haiku.py b/examples/anthropic/csv_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_haiku.py
rename to examples/anthropic/csv_scraper_anthropic.py
diff --git a/examples/anthropic/csv_scraper_graph_multi_haiku.py b/examples/anthropic/csv_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/csv_scraper_graph_multi_haiku.py
rename to examples/anthropic/csv_scraper_graph_multi_anthropic.py
diff --git a/examples/anthropic/custom_graph_haiku.py b/examples/anthropic/custom_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/custom_graph_haiku.py
rename to examples/anthropic/custom_graph_anthropic.py
diff --git a/examples/anthropic/depth_search_graph_anthropic.py b/examples/anthropic/depth_search_graph_anthropic.py
new file mode 100644
index 00000000..8cac7bea
--- /dev/null
+++ b/examples/anthropic/depth_search_graph_anthropic.py
@@ -0,0 +1,28 @@
+"""
+depth_search_graph_anthropic example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("ANTHROPIC_API_KEY"),
+        "model": "anthropic/claude-3-haiku-20240307",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/anthropic/json_scraper_haiku.py b/examples/anthropic/json_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_haiku.py
rename to examples/anthropic/json_scraper_anthropic.py
diff --git a/examples/anthropic/json_scraper_multi_haiku.py b/examples/anthropic/json_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/json_scraper_multi_haiku.py
rename to examples/anthropic/json_scraper_multi_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_graph_haiku.py b/examples/anthropic/pdf_scraper_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_graph_haiku.py
rename to examples/anthropic/pdf_scraper_graph_anthropic.py
diff --git a/examples/anthropic/pdf_scraper_multi_haiku.py b/examples/anthropic/pdf_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/pdf_scraper_multi_haiku.py
rename to examples/anthropic/pdf_scraper_multi_anthropic.py
diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_anthropic.py
similarity index 100%
rename from examples/anthropic/rate_limit_haiku.py
rename to examples/anthropic/rate_limit_anthropic.py
diff --git a/examples/anthropic/scrape_plain_text_haiku.py b/examples/anthropic/scrape_plain_text_anthropic.py
similarity index 100%
rename from examples/anthropic/scrape_plain_text_haiku.py
rename to examples/anthropic/scrape_plain_text_anthropic.py
diff --git a/examples/anthropic/script_generator_haiku.py b/examples/anthropic/script_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_generator_haiku.py
rename to examples/anthropic/script_generator_anthropic.py
diff --git a/examples/anthropic/script_multi_generator_haiku.py b/examples/anthropic/script_multi_generator_anthropic.py
similarity index 100%
rename from examples/anthropic/script_multi_generator_haiku.py
rename to examples/anthropic/script_multi_generator_anthropic.py
diff --git a/examples/anthropic/search_graph_haiku.py b/examples/anthropic/search_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/search_graph_haiku.py
rename to examples/anthropic/search_graph_anthropic.py
diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_anthropic.py
similarity index 100%
rename from examples/anthropic/search_graph_schema_haiku.py
rename to examples/anthropic/search_graph_schema_anthropic.py
diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_anthropic.py
similarity index 100%
rename from examples/anthropic/search_link_graph_haiku.py
rename to examples/anthropic/search_link_graph_anthropic.py
diff --git a/examples/anthropic/smart_scraper_haiku.py b/examples/anthropic/smart_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_haiku.py
rename to examples/anthropic/smart_scraper_anthropic.py
diff --git a/examples/anthropic/smart_scraper_multi_haiku.py b/examples/anthropic/smart_scraper_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_multi_haiku.py
rename to examples/anthropic/smart_scraper_multi_anthropic.py
diff --git a/examples/anthropic/smart_scraper_multi_concat_haiku.py b/examples/anthropic/smart_scraper_multi_concat_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_multi_concat_haiku.py
rename to examples/anthropic/smart_scraper_multi_concat_anthropic.py
diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_anthropic.py
similarity index 100%
rename from examples/anthropic/smart_scraper_schema_haiku.py
rename to examples/anthropic/smart_scraper_schema_anthropic.py
diff --git a/examples/anthropic/xml_scraper_haiku.py b/examples/anthropic/xml_scraper_anthropic.py
similarity index 100%
rename from examples/anthropic/xml_scraper_haiku.py
rename to examples/anthropic/xml_scraper_anthropic.py
diff --git a/examples/anthropic/xml_scraper_graph_multi_haiku.py b/examples/anthropic/xml_scraper_graph_multi_anthropic.py
similarity index 100%
rename from examples/anthropic/xml_scraper_graph_multi_haiku.py
rename to examples/anthropic/xml_scraper_graph_multi_anthropic.py
diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py
index ad48933f..4bad1b0d 100644
--- a/examples/azure/code_generator_graph_azure.py
+++ b/examples/azure/code_generator_graph_azure.py
@@ -28,7 +28,7 @@ class Projects(BaseModel):
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False,
diff --git a/examples/azure/csv_scraper_azure.py b/examples/azure/csv_scraper_azure.py
index efc99758..272527b3 100644
--- a/examples/azure/csv_scraper_azure.py
+++ b/examples/azure/csv_scraper_azure.py
@@ -25,7 +25,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/csv_scraper_graph_multi_azure.py b/examples/azure/csv_scraper_graph_multi_azure.py
index d9160c40..cccbf88e 100644
--- a/examples/azure/csv_scraper_graph_multi_azure.py
+++ b/examples/azure/csv_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/depth_search_graph_azure.py b/examples/azure/depth_search_graph_azure.py
new file mode 100644
index 00000000..88b2cd1b
--- /dev/null
+++ b/examples/azure/depth_search_graph_azure.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_azure example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": os.environ["AZURE_OPENAI_KEY"],
+        "model": "azure_openai/gpt-4o",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/azure/json_scraper_azure.py b/examples/azure/json_scraper_azure.py
index 483544fe..5ba54f7b 100644
--- a/examples/azure/json_scraper_azure.py
+++ b/examples/azure/json_scraper_azure.py
@@ -23,7 +23,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/json_scraper_multi_azure.py b/examples/azure/json_scraper_multi_azure.py
index ecf97280..befc4e84 100644
--- a/examples/azure/json_scraper_multi_azure.py
+++ b/examples/azure/json_scraper_multi_azure.py
@@ -12,7 +12,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/pdf_scraper_azure.py b/examples/azure/pdf_scraper_azure.py
index f8926489..02b3b7e6 100644
--- a/examples/azure/pdf_scraper_azure.py
+++ b/examples/azure/pdf_scraper_azure.py
@@ -10,7 +10,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py
index cfd05f1f..892996c7 100644
--- a/examples/azure/rate_limit_azure.py
+++ b/examples/azure/rate_limit_azure.py
@@ -26,7 +26,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o",
         "rate_limit": {
             "requests_per_second": 1
         },
diff --git a/examples/azure/scrape_plain_text_azure.py b/examples/azure/scrape_plain_text_azure.py
index ef0d7d1c..9ea18d07 100644
--- a/examples/azure/scrape_plain_text_azure.py
+++ b/examples/azure/scrape_plain_text_azure.py
@@ -28,7 +28,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/script_generator_azure.py b/examples/azure/script_generator_azure.py
index 12f5d6be..b2bbb220 100644
--- a/examples/azure/script_generator_azure.py
+++ b/examples/azure/script_generator_azure.py
@@ -15,7 +15,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/script_multi_generator_azure.py b/examples/azure/script_multi_generator_azure.py
index a1bb8dbd..8c52cb95 100644
--- a/examples/azure/script_multi_generator_azure.py
+++ b/examples/azure/script_multi_generator_azure.py
@@ -16,7 +16,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/search_graph_azure.py b/examples/azure/search_graph_azure.py
index 13547e06..949f134c 100644
--- a/examples/azure/search_graph_azure.py
+++ b/examples/azure/search_graph_azure.py
@@ -22,7 +22,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/search_graph_schema_azure.py b/examples/azure/search_graph_schema_azure.py
index 629c92ab..e8c10093 100644
--- a/examples/azure/search_graph_schema_azure.py
+++ b/examples/azure/search_graph_schema_azure.py
@@ -30,7 +30,7 @@ class Dishes(BaseModel):
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py
index aec2297b..42ed07ad 100644
--- a/examples/azure/search_link_graph_azure.py
+++ b/examples/azure/search_link_graph_azure.py
@@ -15,7 +15,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/smart_scraper_azure.py b/examples/azure/smart_scraper_azure.py
index bf3bc8d7..933dc5b0 100644
--- a/examples/azure/smart_scraper_azure.py
+++ b/examples/azure/smart_scraper_azure.py
@@ -26,7 +26,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/smart_scraper_multi_azure.py b/examples/azure/smart_scraper_multi_azure.py
index f1f3451e..e066eaf1 100644
--- a/examples/azure/smart_scraper_multi_azure.py
+++ b/examples/azure/smart_scraper_multi_azure.py
@@ -14,7 +14,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/smart_scraper_multi_concat_azure.py b/examples/azure/smart_scraper_multi_concat_azure.py
index e3870a4c..06d08b9a 100644
--- a/examples/azure/smart_scraper_multi_concat_azure.py
+++ b/examples/azure/smart_scraper_multi_concat_azure.py
@@ -15,7 +15,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py
index d0816bf5..d2766ecb 100644
--- a/examples/azure/smart_scraper_schema_azure.py
+++ b/examples/azure/smart_scraper_schema_azure.py
@@ -29,7 +29,7 @@ class Projects(BaseModel):
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/xml_scraper_azure.py b/examples/azure/xml_scraper_azure.py
index ecfb8743..1c40f3e7 100644
--- a/examples/azure/xml_scraper_azure.py
+++ b/examples/azure/xml_scraper_azure.py
@@ -24,7 +24,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o"
     },
     "verbose": True,
     "headless": False
diff --git a/examples/azure/xml_scraper_graph_multi_azure.py b/examples/azure/xml_scraper_graph_multi_azure.py
index db4db108..972eb823 100644
--- a/examples/azure/xml_scraper_graph_multi_azure.py
+++ b/examples/azure/xml_scraper_graph_multi_azure.py
@@ -25,7 +25,7 @@
 graph_config = {
     "llm": {
         "api_key": os.environ["AZURE_OPENAI_KEY"],
-        "model": "azure_openai/gpt-3.5-turbo",
+        "model": "azure_openai/gpt-4o",
     },
     "verbose": True,
     "headless": False
diff --git a/examples/bedrock/depth_search_graph_bedrock.py b/examples/bedrock/depth_search_graph_bedrock.py
new file mode 100644
index 00000000..2ab88291
--- /dev/null
+++ b/examples/bedrock/depth_search_graph_bedrock.py
@@ -0,0 +1,31 @@
+"""
+depth_search_graph_bedrock example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "client": "client_name",
+        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        "temperature": 0.0
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/deepseek/depth_search_graph_deepseek.py b/examples/deepseek/depth_search_graph_deepseek.py
new file mode 100644
index 00000000..064690a5
--- /dev/null
+++ b/examples/deepseek/depth_search_graph_deepseek.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_deepseek example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+deepseek_key = os.getenv("DEEPSEEK_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "deepseek/deepseek-chat",
+        "api_key": deepseek_key,
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/ernie/custom_graph_ernie.py b/examples/ernie/custom_graph_ernie.py
index 57d422e5..a3082cf7 100644
--- a/examples/ernie/custom_graph_ernie.py
+++ b/examples/ernie/custom_graph_ernie.py
@@ -14,7 +14,7 @@
 # Define the configuration for the graph
 # ************************************************
 
-graph_config = {
+graph_config = {
     "llm": {
         "model": "ernie/ernie-bot-turbo",
         "ernie_client_id": "",
diff --git a/examples/ernie/depth_search_graph_ernie.py b/examples/ernie/depth_search_graph_ernie.py
new file mode 100644
index 00000000..99470d8d
--- /dev/null
+++ b/examples/ernie/depth_search_graph_ernie.py
@@ -0,0 +1,26 @@
+"""
+depth_search_graph_ernie example
+"""
+from scrapegraphai.graphs import DepthSearchGraph
+
+graph_config = {
+    "llm": {
+        "model": "ernie/ernie-bot-turbo",
+        "ernie_client_id": "",
+        "ernie_client_secret": "",
+        "temperature": 0.1
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/fireworks/depth_search_graph_fireworks.py b/examples/fireworks/depth_search_graph_fireworks.py
new file mode 100644
index 00000000..f467be9f
--- /dev/null
+++ b/examples/fireworks/depth_search_graph_fireworks.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_fireworks example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": fireworks_api_key,
+        "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/google_genai/depth_search_graph_gemini.py b/examples/google_genai/depth_search_graph_gemini.py
new file mode 100644
index 00000000..956341f4
--- /dev/null
+++ b/examples/google_genai/depth_search_graph_gemini.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_gemini example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "google_genai/gemini-pro",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
diff --git a/examples/google_vertexai/depth_search_graph_gemini.py b/examples/google_vertexai/depth_search_graph_gemini.py
new file mode 100644
index 00000000..13bba630
--- /dev/null
+++ b/examples/google_vertexai/depth_search_graph_gemini.py
@@ -0,0 +1,30 @@
+"""
+depth_search_graph_gemini example
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import DepthSearchGraph
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "google_vertexai/gemini-1.5-pro",
+    },
+    "verbose": True,
+    "headless": False,
+    "depth": 2,
+    "only_inside_links": False,
+}
+
+search_graph = DepthSearchGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_vertexai/gemini-1.5-pro", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/groq/depth_search_graph_groq.py b/examples/groq/depth_search_graph_groq.py new file mode 100644 index 00000000..2d1ed8b1 --- /dev/null +++ b/examples/groq/depth_search_graph_groq.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py index cec007b7..06b2f089 100644 --- a/examples/huggingfacehub/custom_graph_huggingfacehub.py +++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py @@ -4,7 +4,6 @@ import os from dotenv import load_dotenv - from langchain_openai import OpenAIEmbeddings from langchain_openai import ChatOpenAI from scrapegraphai.graphs import BaseGraph diff --git a/examples/huggingfacehub/depth_search_graph_huggingfacehub.py b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py new file mode 100644 index 00000000..48df3e37 --- /dev/null +++ b/examples/huggingfacehub/depth_search_graph_huggingfacehub.py @@ -0,0 +1,38 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +load_dotenv() +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/local_models/depth_search_graph_ollama.py b/examples/local_models/depth_search_graph_ollama.py new file mode 100644 index 00000000..d0f960b5 --- /dev/null +++ b/examples/local_models/depth_search_graph_ollama.py @@ -0,0 +1,32 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "model": 
"ollama/llama3.1", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index 6e9c3da3..e80bf5ec 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -15,6 +15,7 @@ "verbose": True, "headless": False, } + FILE_NAME = "inputs/example.json" curr_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(curr_dir, FILE_NAME) diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index 35503bd7..5a5b3cea 100644 --- a/examples/local_models/smart_scraper_schema_ollama.py +++ b/examples/local_models/smart_scraper_schema_ollama.py @@ -24,7 +24,6 @@ class Projects(BaseModel): "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "verbose": True, "headless": False } diff --git a/examples/mistral/depth_search_graph_mistral.py b/examples/mistral/depth_search_graph_mistral.py new file mode 100644 index 00000000..ae18ffba --- /dev/null +++ b/examples/mistral/depth_search_graph_mistral.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +mistral_key = os.getenv("MISTRAL_API_KEY") + +graph_config = { + "llm": { + "api_key": mistral_key, + "model": "mistralai/open-mistral-nemo", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/nemotron/depth_search_graph_nemotron.py b/examples/nemotron/depth_search_graph_nemotron.py new file mode 100644 index 00000000..edd80463 --- /dev/null +++ b/examples/nemotron/depth_search_graph_nemotron.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_KEY"), + "model": "claude-3-haiku-20240307", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/oneapi/depth_search_graph_onenapi.py b/examples/oneapi/depth_search_graph_onenapi.py new file mode 100644 index 00000000..7a2e7f3e --- /dev/null +++ b/examples/oneapi/depth_search_graph_onenapi.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = 
os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/openai/depth_search_graph_openai.py b/examples/openai/depth_search_graph_openai.py new file mode 100644 index 00000000..dff07ad4 --- /dev/null +++ b/examples/openai/depth_search_graph_openai.py @@ -0,0 +1,30 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "openai/gpt-4o-mini", + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/examples/together/depth_search_graph_together.py b/examples/together/depth_search_graph_together.py new file mode 100644 index 00000000..7a2e7f3e --- /dev/null +++ b/examples/together/depth_search_graph_together.py @@ -0,0 +1,31 @@ +""" +depth_search_graph_opeani example +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DepthSearchGraph + +load_dotenv() + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url": "http://127.0.0.1:3000/v1", # 设置 OneAPI URL + }, + "verbose": True, + "headless": False, + "depth": 2, + "only_inside_links": False, +} + +search_graph = DepthSearchGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io", + config=graph_config +) + +result = search_graph.run() +print(result) diff --git a/pyproject.toml b/pyproject.toml index 26b1fdb7..4c5e5117 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,9 @@ dependencies = [ "google>=3.0.0", "langchain-ollama>=0.1.3", "semchunk==2.2.0", - "transformers==4.44.2" + "transformers==4.44.2", + "qdrant-client>=1.11.3", + "fastembed>=0.3.6" ] license = "MIT" @@ -99,11 +101,6 @@ screenshot_scraper = [ "pillow>=10.4.0", ] -# Group 5: Faiss CPU -faiss-cpu = [ - "faiss-cpu>=1.8.0", -] - [build-system] requires = ["hatchling"] build-backend = "hatchling.build" diff --git a/requirements-dev.lock b/requirements-dev.lock index 1d9d469a..3423cef0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -64,6 +64,8 @@ click==8.1.7 # via burr # via streamlit # via uvicorn +coloredlogs==15.0.1 + # via onnxruntime contourpy==1.2.1 # via matplotlib cycler==0.12.1 @@ -84,9 +86,13 @@ fastapi==0.112.0 # via burr fastapi-pagination==0.12.26 # via burr +fastembed==0.3.6 + # via scrapegraphai filelock==3.15.4 # via huggingface-hub # via transformers +flatbuffers==24.3.25 + # via onnxruntime fonttools==4.53.1 # via matplotlib free-proxy==1.1.1 @@ -132,11 +138,19 @@ greenlet==3.0.3 grpcio==1.65.4 # via google-api-core # via grpcio-status + # via grpcio-tools + # via qdrant-client grpcio-status==1.62.3 # via google-api-core +grpcio-tools==1.62.3 + # via 
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 1d9d469a..3423cef0 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -64,6 +64,8 @@ click==8.1.7
   # via burr
   # via streamlit
   # via uvicorn
+coloredlogs==15.0.1
+  # via onnxruntime
 contourpy==1.2.1
   # via matplotlib
 cycler==0.12.1
@@ -84,9 +86,13 @@ fastapi==0.112.0
   # via burr
 fastapi-pagination==0.12.26
   # via burr
+fastembed==0.3.6
+  # via scrapegraphai
 filelock==3.15.4
   # via huggingface-hub
   # via transformers
+flatbuffers==24.3.25
+  # via onnxruntime
 fonttools==4.53.1
   # via matplotlib
 free-proxy==1.1.1
@@ -132,11 +138,19 @@ greenlet==3.0.3
 grpcio==1.65.4
   # via google-api-core
   # via grpcio-status
+  # via grpcio-tools
+  # via qdrant-client
 grpcio-status==1.62.3
   # via google-api-core
+grpcio-tools==1.62.3
+  # via qdrant-client
 h11==0.14.0
   # via httpcore
   # via uvicorn
+h2==4.1.0
+  # via httpx
+hpack==4.0.0
+  # via h2
 html2text==2024.2.26
   # via scrapegraphai
 httpcore==1.0.5
@@ -149,11 +163,17 @@ httpx==0.27.0
   # via langsmith
   # via ollama
   # via openai
+  # via qdrant-client
 httpx-sse==0.4.0
   # via langchain-mistralai
 huggingface-hub==0.24.5
+  # via fastembed
   # via tokenizers
   # via transformers
+humanfriendly==10.0
+  # via coloredlogs
+hyperframe==6.0.1
+  # via h2
 idna==3.7
   # via anyio
   # via httpx
@@ -218,6 +238,7 @@ langsmith==0.1.121
   # via langchain-core
 loguru==0.7.2
   # via burr
+  # via fastembed
 lxml==5.3.0
   # via free-proxy
 markdown-it-py==3.0.0
@@ -236,8 +257,12 @@ minify-html==0.15.0
   # via scrapegraphai
 mistral-common==1.4.1
   # via scrapegraphai
+mmh3==4.1.0
+  # via fastembed
 mpire==2.10.2
   # via semchunk
+mpmath==1.3.0
+  # via sympy
 multidict==6.0.5
   # via aiohttp
   # via yarl
@@ -249,19 +274,27 @@ narwhals==1.3.0
   # via altair
 numpy==1.26.4
   # via contourpy
+  # via fastembed
   # via langchain
   # via langchain-aws
   # via langchain-community
   # via matplotlib
+  # via onnx
+  # via onnxruntime
   # via opencv-python-headless
   # via pandas
   # via pyarrow
   # via pydeck
+  # via qdrant-client
   # via sf-hamilton
   # via streamlit
   # via transformers
 ollama==0.3.2
   # via langchain-ollama
+onnx==1.17.0
+  # via fastembed
+onnxruntime==1.19.2
+  # via fastembed
 openai==1.40.3
   # via burr
   # via langchain-openai
@@ -275,6 +308,7 @@ packaging==24.1
   # via langchain-core
   # via marshmallow
   # via matplotlib
+  # via onnxruntime
   # via pytest
   # via sphinx
   # via streamlit
@@ -284,6 +318,7 @@ pandas==2.2.2
   # via sf-hamilton
   # via streamlit
 pillow==10.4.0
+  # via fastembed
   # via matplotlib
   # via mistral-common
   # via streamlit
@@ -294,6 +329,8 @@ playwright==1.45.1
   # via undetected-playwright
 pluggy==1.5.0
   # via pytest
+portalocker==2.10.1
+  # via qdrant-client
 proto-plus==1.24.0
   # via google-ai-generativelanguage
   # via google-api-core
@@ -303,6 +340,9 @@ protobuf==4.25.4
   # via google-generativeai
   # via googleapis-common-protos
   # via grpcio-status
+  # via grpcio-tools
+  # via onnx
+  # via onnxruntime
   # via proto-plus
   # via streamlit
 pyarrow==17.0.0
@@ -326,6 +366,7 @@ pydantic==2.8.2
   # via mistral-common
   # via openai
   # via pydantic-settings
+  # via qdrant-client
 pydantic-core==2.20.1
   # via pydantic
 pydantic-settings==2.5.2
@@ -343,6 +384,8 @@ pylint==3.2.6
 pyparsing==3.1.2
   # via httplib2
   # via matplotlib
+pystemmer==2.2.0.1
+  # via fastembed
 pytest==8.0.0
   # via pytest-mock
 pytest-mock==3.14.0
@@ -361,6 +404,8 @@ pyyaml==6.0.2
   # via langchain-community
   # via langchain-core
   # via transformers
+qdrant-client==1.11.3
+  # via scrapegraphai
 referencing==0.35.1
   # via jsonschema
   # via jsonschema-specifications
@@ -369,6 +414,7 @@ regex==2024.7.24
   # via transformers
 requests==2.32.3
   # via burr
+  # via fastembed
   # via free-proxy
   # via google-api-core
   # via huggingface-hub
@@ -395,6 +441,8 @@ semchunk==2.2.0
   # via scrapegraphai
 sentencepiece==0.2.0
   # via mistral-common
+setuptools==75.1.0
+  # via grpcio-tools
 sf-hamilton==1.73.1
   # via burr
 six==1.16.0
@@ -406,6 +454,7 @@ sniffio==1.3.1
   # via httpx
   # via openai
 snowballstemmer==2.2.0
+  # via fastembed
   # via sphinx
 soupsieve==2.5
   # via beautifulsoup4
@@ -434,6 +483,8 @@ starlette==0.37.2
   # via fastapi
 streamlit==1.37.1
   # via burr
+sympy==1.13.3
+  # via onnxruntime
 tenacity==8.5.0
   # via langchain
   # via langchain-community
@@ -444,6 +495,7 @@ tiktoken==0.7.0
   # via mistral-common
   # via scrapegraphai
 tokenizers==0.19.1
+  # via fastembed
   # via langchain-mistralai
   # via transformers
 toml==0.10.2
@@ -456,6 +508,7 @@ tomlkit==0.13.0
 tornado==6.4.1
   # via streamlit
 tqdm==4.66.5
+  # via fastembed
   # via google-generativeai
   # via huggingface-hub
   # via mpire
@@ -495,6 +548,7 @@ uritemplate==4.1.1
   # via google-api-python-client
 urllib3==1.26.19
   # via botocore
+  # via qdrant-client
   # via requests
 uvicorn==0.30.5
   # via burr
diff --git a/requirements.lock b/requirements.lock
index 84e25a0f..8949648a 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -41,6 +41,8 @@ certifi==2024.7.4
   # via requests
 charset-normalizer==3.3.2
   # via requests
+coloredlogs==15.0.1
+  # via onnxruntime
 dataclasses-json==0.6.7
   # via langchain-community
 dill==0.3.8
@@ -49,9 +51,13 @@ distro==1.9.0
   # via openai
 exceptiongroup==1.2.2
   # via anyio
+fastembed==0.3.6
+  # via scrapegraphai
 filelock==3.15.4
   # via huggingface-hub
   # via transformers
+flatbuffers==24.3.25
+  # via onnxruntime
 free-proxy==1.1.1
   # via scrapegraphai
 frozenlist==1.4.1
@@ -87,10 +93,18 @@ greenlet==3.0.3
 grpcio==1.65.1
   # via google-api-core
   # via grpcio-status
+  # via grpcio-tools
+  # via qdrant-client
 grpcio-status==1.62.2
   # via google-api-core
+grpcio-tools==1.62.3
+  # via qdrant-client
 h11==0.14.0
   # via httpcore
+h2==4.1.0
+  # via httpx
+hpack==4.0.0
+  # via h2
 html2text==2024.2.26
   # via scrapegraphai
 httpcore==1.0.5
@@ -103,11 +117,17 @@ httpx==0.27.0
   # via langsmith
   # via ollama
   # via openai
+  # via qdrant-client
 httpx-sse==0.4.0
   # via langchain-mistralai
 huggingface-hub==0.24.1
+  # via fastembed
   # via tokenizers
   # via transformers
+humanfriendly==10.0
+  # via coloredlogs
+hyperframe==6.0.1
+  # via h2
 idna==3.7
   # via anyio
   # via httpx
@@ -156,6 +176,8 @@ langsmith==0.1.121
   # via langchain
   # via langchain-community
   # via langchain-core
+loguru==0.7.2
+  # via fastembed
 lxml==5.2.2
   # via free-proxy
 marshmallow==3.21.3
@@ -164,8 +186,12 @@ minify-html==0.15.0
   # via scrapegraphai
 mistral-common==1.4.1
   # via scrapegraphai
+mmh3==4.1.0
+  # via fastembed
 mpire==2.10.2
   # via semchunk
+mpmath==1.3.0
+  # via sympy
 multidict==6.0.5
   # via aiohttp
   # via yarl
@@ -174,14 +200,22 @@ multiprocess==0.70.16
 mypy-extensions==1.0.0
   # via typing-inspect
 numpy==1.26.4
+  # via fastembed
   # via langchain
   # via langchain-aws
   # via langchain-community
+  # via onnx
+  # via onnxruntime
   # via opencv-python-headless
   # via pandas
+  # via qdrant-client
   # via transformers
 ollama==0.3.2
   # via langchain-ollama
+onnx==1.17.0
+  # via fastembed
+onnxruntime==1.19.2
+  # via fastembed
 openai==1.41.0
   # via langchain-openai
 opencv-python-headless==4.10.0.84
@@ -192,14 +226,18 @@ packaging==24.1
   # via huggingface-hub
   # via langchain-core
   # via marshmallow
+  # via onnxruntime
   # via transformers
 pandas==2.2.2
   # via scrapegraphai
 pillow==10.4.0
+  # via fastembed
   # via mistral-common
 playwright==1.45.1
   # via scrapegraphai
   # via undetected-playwright
+portalocker==2.10.1
+  # via qdrant-client
 proto-plus==1.24.0
   # via google-ai-generativelanguage
   # via google-api-core
@@ -209,6 +247,9 @@ protobuf==4.25.3
   # via google-generativeai
   # via googleapis-common-protos
   # via grpcio-status
+  # via grpcio-tools
+  # via onnx
+  # via onnxruntime
   # via proto-plus
 pyasn1==0.6.0
   # via pyasn1-modules
@@ -226,6 +267,7 @@ pydantic==2.8.2
   # via mistral-common
   # via openai
   # via pydantic-settings
+  # via qdrant-client
 pydantic-core==2.20.1
   # via pydantic
 pydantic-settings==2.5.2
@@ -236,6 +278,8 @@ pygments==2.18.0
   # via mpire
 pyparsing==3.1.2
   # via httplib2
+pystemmer==2.2.0.1
+  # via fastembed
 python-dateutil==2.9.0.post0
   # via botocore
   # via pandas
@@ -250,6 +294,8 @@ pyyaml==6.0.1
   # via langchain-community
   # via langchain-core
   # via transformers
+qdrant-client==1.11.3
+  # via scrapegraphai
 referencing==0.35.1
   # via jsonschema
   # via jsonschema-specifications
@@ -257,6 +303,7 @@ regex==2024.5.15
   # via tiktoken
   # via transformers
 requests==2.32.3
+  # via fastembed
   # via free-proxy
   # via google-api-core
   # via huggingface-hub
@@ -279,17 +326,23 @@ semchunk==2.2.0
   # via scrapegraphai
 sentencepiece==0.2.0
   # via mistral-common
+setuptools==75.1.0
+  # via grpcio-tools
 six==1.16.0
   # via python-dateutil
 sniffio==1.3.1
   # via anyio
   # via httpx
   # via openai
+snowballstemmer==2.2.0
+  # via fastembed
 soupsieve==2.5
   # via beautifulsoup4
 sqlalchemy==2.0.31
   # via langchain
   # via langchain-community
+sympy==1.13.3
+  # via onnxruntime
 tenacity==8.5.0
   # via langchain
   # via langchain-community
@@ -299,9 +352,11 @@ tiktoken==0.7.0
   # via mistral-common
   # via scrapegraphai
 tokenizers==0.19.1
+  # via fastembed
   # via langchain-mistralai
   # via transformers
 tqdm==4.66.4
+  # via fastembed
   # via google-generativeai
   # via huggingface-hub
   # via mpire
@@ -333,6 +388,7 @@ uritemplate==4.1.1
   # via google-api-python-client
 urllib3==1.26.19
   # via botocore
+  # via qdrant-client
   # via requests
 yarl==1.9.4
   # via aiohttp
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index efd6bd7e..b5ffcc47 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -26,3 +26,4 @@
 from .screenshot_scraper_graph import ScreenshotScraperGraph
 from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
 from .code_generator_graph import CodeGeneratorGraph
+from .depth_search_graph import DepthSearchGraph
diff --git a/scrapegraphai/graphs/depth_search_graph.py b/scrapegraphai/graphs/depth_search_graph.py
new file mode 100644
index 00000000..13b39129
--- /dev/null
+++ b/scrapegraphai/graphs/depth_search_graph.py
@@ -0,0 +1,151 @@
+"""
+DepthSearchGraph Module
+"""
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..utils.save_code_to_file import save_code_to_file
+from ..nodes import (
+    FetchNodeLevelK,
+    ParseNodeDepthK,
+    DescriptionNode,
+    RAGNode,
+    GenerateAnswerNodeKLevel
+)
+
+class DepthSearchGraph(AbstractGraph):
+    """
+    DepthSearchGraph is a scraping pipeline that fetches a page and its
+    hyperlinks recursively up to a configurable depth, summarizes each
+    fetched document, stores the summaries in a vector database, and
+    generates the answer to the user prompt from the retrieved chunks.
+    It requires a user prompt and a source URL (or local directory).
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> search_graph = DepthSearchGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
+        ... )
+        >>> result = search_graph.run()
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+
+        super().__init__(prompt, config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+
+        fetch_node_k = FetchNodeLevelK(
+            input="url| local_dir",
+            output=["docs"],
+            node_config={
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "browser_base": self.config.get("browser_base"),
+                "depth": self.config.get("depth", 1),
+                "only_inside_links": self.config.get("only_inside_links", False)
+            }
+        )
+
+        parse_node_k = ParseNodeDepthK(
+            input="docs",
+            output=["docs"],
+            node_config={
+                "verbose": self.config.get("verbose", False)
+            }
+        )
+
+        description_node = DescriptionNode(
+            input="docs",
+            output=["docs"],
+            node_config={
+                "llm_model": self.llm_model,
+                "verbose": self.config.get("verbose", False),
+                "cache_path": self.config.get("cache_path", False)
+            }
+        )
+
+        rag_node = RAGNode(
+            input="docs",
+            output=["vectorial_db"],
+            node_config={
+                "llm_model": self.llm_model,
+                "embedder_model": self.config.get("embedder_model", False),
+                "verbose": self.config.get("verbose", False),
+            }
+        )
+
+        generate_answer_k = GenerateAnswerNodeKLevel(
+            input="vectorial_db",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+                "embedder_model": self.config.get("embedder_model", False),
+                "verbose": self.config.get("verbose", False),
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node_k,
+                parse_node_k,
+                description_node,
+                rag_node,
+                generate_answer_k
+            ],
+            edges=[
+                (fetch_node_k, parse_node_k),
+                (parse_node_k, description_node),
+                (description_node, rag_node),
+                (rag_node, generate_answer_k)
+            ],
+            entry_point=fetch_node_k,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        docs = self.final_state.get("answer", "No answer")
+
+        return docs
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index ec16c48e..edb195a5 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -28,3 +28,7 @@
 from .generate_code_node import GenerateCodeNode
 from .search_node_with_context import SearchLinksWithContext
 from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodeLevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode
+from .parse_node_depth_k import ParseNodeDepthK
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "DESCRIPTION", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm_model"] + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + self.logger.info(f"--- Executing {self.node_name} Node ---") + + docs = [elem for elem in state.get("docs")] + + chains_dict = {} + + for i, chunk in enumerate(tqdm(docs, desc="Processing chunks", disable=not self.verbose)): + prompt = PromptTemplate( + template=DESCRIPTION_NODE_PROMPT, + partial_variables={"content": chunk.get("document")} + ) + chain_name = f"chunk{i+1}" + chains_dict[chain_name] = prompt | self.llm_model + + async_runner = RunnableParallel(**chains_dict) + batch_results = async_runner.invoke({}) + + + for i in range(1, len(docs)+1): + docs[i-1]["summary"] = batch_results.get(f"chunk{i}").content + + state.update({self.output[0]: docs}) + + return state diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py new file mode 100644 index 00000000..d321b33c --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -0,0 +1,214 @@ +from typing import List, Optional +from .base_node import BaseNode +from ..docloaders import ChromiumLoader +from ..utils.cleanup_html import cleanup_html +from ..utils.convert_to_md import convert_to_md +from langchain_core.documents import Document +from bs4 import BeautifulSoup +from urllib.parse import quote, urljoin + +class FetchNodeLevelK(BaseNode): + """ + A node responsible for fetching the HTML content of a specified URL and all its sub-links + recursively up to a certain level of hyperlink the graph. This content is then used to update + the graph's state. It uses ChromiumLoader to fetch the content from a web page asynchronously + (with proxy protection). + + Attributes: + embedder_model: An optional model for embedding the fetched content. + verbose (bool): A flag indicating whether to show print statements during execution. + cache_path (str): Path to cache fetched content. + headless (bool): Whether to run the Chromium browser in headless mode. + loader_kwargs (dict): Additional arguments for the content loader. + browser_base (dict): Optional configuration for the browser base API. + depth (int): Maximum depth of hyperlink graph traversal. + only_inside_links (bool): Whether to fetch only internal links. + min_input_len (int): Minimum required length of input data. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "FetchLevelK", + ): + """ + Initializes the FetchNodeLevelK instance. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The name of the node (default is "FetchLevelK"). 
+ """ + super().__init__(node_name, "node", input, output, 2, node_config) + + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = node_config.get("verbose", False) if node_config else False + self.cache_path = node_config.get("cache_path", False) + self.headless = node_config.get("headless", True) if node_config else True + self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {} + self.browser_base = node_config.get("browser_base", None) + self.depth = node_config.get("depth", 1) if node_config else 1 + self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False + self.min_input_len = 1 + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to fetch the HTML content of a specified URL and its sub-links + recursively, then updates the graph's state with the fetched content. + + Args: + state (dict): The current state of the graph. + + Returns: + dict: The updated state with a new output key containing the fetched HTML content. + + Raises: + KeyError: If the input key is not found in the state. + """ + self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + input_data = [state[key] for key in input_keys] + source = input_data[0] + + documents = [{"source": source}] + loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {} + + for _ in range(self.depth): + documents = self.obtain_content(documents, loader_kwargs) + + filtered_documents = [doc for doc in documents if 'document' in doc] + state.update({self.output[0]: filtered_documents}) + return state + + def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: + """ + Fetches the HTML content of a given source URL. + + Args: + source (str): The URL to fetch content from. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + Optional[str]: The fetched HTML content or None if fetching failed. + """ + self.logger.info(f"--- (Fetching HTML from: {source}) ---") + + if self.browser_base is not None: + try: + from ..docloaders.browser_base import browser_base_fetch + except ImportError: + raise ImportError("""The browserbase module is not installed. + Please install it using `pip install browserbase`.""") + + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + document = [Document(page_content=content, metadata={"source": source}) for content in data] + else: + loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) + document = loader.load() + return document + + def extract_links(self, html_content: str) -> list: + """ + Extracts all hyperlinks from the HTML content. + + Args: + html_content (str): The HTML content to extract links from. + + Returns: + list: A list of extracted hyperlinks. + """ + soup = BeautifulSoup(html_content, 'html.parser') + links = [link['href'] for link in soup.find_all('a', href=True)] + self.logger.info(f"Extracted {len(links)} links.") + return links + + def get_full_links(self, base_url: str, links: list) -> list: + """ + Converts relative URLs to full URLs based on the base URL. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to convert. + + Returns: + list: A list of full URLs. 
+ """ + full_links = [] + for link in links: + if self.only_inside_links and link.startswith("http"): + continue + full_link = link if link.startswith("http") else urljoin(base_url, link) + full_links.append(full_link) + return full_links + + def obtain_content(self, documents: List, loader_kwargs) -> List: + """ + Iterates through documents, fetching and updating content recursively. + + Args: + documents (List): A list of documents containing the source URLs. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + List: The updated list of documents with fetched content. + """ + new_documents = [] + for doc in documents: + source = doc['source'] + if 'document' not in doc: + document = self.fetch_content(source, loader_kwargs) + + if not document or not document[0].page_content.strip(): + self.logger.warning(f"Failed to fetch content for {source}") + documents.remove(doc) + continue + + doc['document'] = document + links = self.extract_links(doc['document'][0].page_content) + full_links = self.get_full_links(source, links) + + for link in full_links: + if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): + new_documents.append({"source": link}) + + documents.extend(new_documents) + return documents + + def process_links(self, base_url: str, links: list, + loader_kwargs, depth: int, current_depth: int = 1) -> dict: + """ + Processes a list of links recursively up to a given depth. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to process. + loader_kwargs (dict): Additional arguments for the content loader. + depth (int): The maximum depth for recursion. + current_depth (int): The current depth of recursion (default is 1). + + Returns: + dict: A dictionary containing processed link content. + """ + content_dict = {} + for idx, link in enumerate(links, start=1): + full_link = link if link.startswith("http") else urljoin(base_url, link) + self.logger.info(f"Processing link {idx}: {full_link}") + link_content = self.fetch_content(full_link, loader_kwargs) + + if current_depth < depth: + new_links = self.extract_links(link_content) + content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1)) + else: + self.logger.warning(f"Failed to fetch content for {full_link}") + return content_dict diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 15686ec1..d5034a1e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,3 +1,6 @@ +""" +generate_answer_node module +""" from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -15,6 +18,26 @@ ) class GenerateAnswerNode(BaseNode): + """ + Initializes the GenerateAnswerNode class. + + Args: + input (str): The input data type for the node. + output (List[str]): The output data type(s) for the node. + node_config (Optional[dict]): Configuration dictionary for the node, + which includes the LLM model, verbosity, schema, and other settings. + Defaults to None. + node_name (str): The name of the node. Defaults to "GenerateAnswer". + + Attributes: + llm_model: The language model specified in the node configuration. + verbose (bool): Whether verbose mode is enabled. + force (bool): Whether to force certain behaviors, overriding defaults. 
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 15686ec1..d5034a1e 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -1,3 +1,6 @@
+"""
+generate_answer_node module
+"""
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
@@ -15,6 +18,26 @@
 )

 class GenerateAnswerNode(BaseNode):
+    """
+    A node that generates an answer from the scraped content using a language model.
+
+    Args:
+        input (str): The input data type for the node.
+        output (List[str]): The output data type(s) for the node.
+        node_config (Optional[dict]): Configuration dictionary for the node,
+            which includes the LLM model, verbosity, schema, and other settings.
+            Defaults to None.
+        node_name (str): The name of the node. Defaults to "GenerateAnswer".
+
+    Attributes:
+        llm_model: The language model specified in the node configuration.
+        verbose (bool): Whether verbose mode is enabled.
+        force (bool): Whether to force certain behaviors, overriding defaults.
+        script_creator (bool): Whether the node is in script creation mode.
+        is_md_scraper (bool): Whether the node is scraping markdown data.
+        additional_info (Optional[str]): Any additional information to be
+            included in the prompt templates.
+    """
     def __init__(
         self,
         input: str,
@@ -100,7 +123,9 @@ def execute(self, state: dict) -> dict:
             prompt = PromptTemplate(
                 template=template_chunks_prompt,
                 input_variables=["question"],
-                partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}
+                partial_variables={"context": chunk,
+                                   "chunk_id": i + 1,
+                                   "format_instructions": format_instructions}
             )
             chain_name = f"chunk{i+1}"
             chains_dict[chain_name] = prompt | self.llm_model
diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py
new file mode 100644
index 00000000..291109f2
--- /dev/null
+++ b/scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -0,0 +1,150 @@
+"""
+GenerateAnswerNodeKLevel Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from tqdm import tqdm
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_aws import ChatBedrock
+from ..utils.output_parser import get_structured_output_parser, get_pydantic_output_parser
+from .base_node import BaseNode
+from ..prompts import (
+    TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE,
+    TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
+)
+
+class GenerateAnswerNodeKLevel(BaseNode):
+    """
+    A node that generates an answer from the chunks retrieved at depth k.
+    It queries the vector database built by RAGNode for the most relevant
+    chunks and merges the per-chunk answers into a single response, allowing
+    big documents to be scraped without exceeding the token limit of the
+    language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "GANLK".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "GANLK",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = node_config.get("verbose", False)
+        self.force = node_config.get("force", False)
+        self.script_creator = node_config.get("script_creator", False)
+        self.is_md_scraper = node_config.get("is_md_scraper", False)
+        self.additional_info = node_config.get("additional_info")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Retrieves the most relevant chunks from the vector database and
+        generates a merged answer, stored under state["answer"].
+        """
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        user_prompt = state.get("user_prompt")
+
+        if self.node_config.get("schema", None) is not None:
+            if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
+                self.llm_model = self.llm_model.with_structured_output(
+                    schema=self.node_config["schema"]
+                )
+                output_parser = get_structured_output_parser(self.node_config["schema"])
+                format_instructions = "NA"
+            else:
+                if not isinstance(self.llm_model, ChatBedrock):
+                    output_parser = get_pydantic_output_parser(self.node_config["schema"])
+                    format_instructions = output_parser.get_format_instructions()
+                else:
+                    output_parser = None
+                    format_instructions = ""
+        else:
+            if not isinstance(self.llm_model, ChatBedrock):
+                output_parser = JsonOutputParser()
+                format_instructions = output_parser.get_format_instructions()
+            else:
+                output_parser = None
+                format_instructions = ""
+
+        if ((isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI))
+             and not self.script_creator)
+                or (self.force and not self.script_creator)
+                or self.is_md_scraper):
+            template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD
+            template_chunks_prompt = TEMPLATE_CHUNKS_MD
+            template_merge_prompt = TEMPLATE_MERGE_MD
+        else:
+            template_no_chunks_prompt = TEMPLATE_NO_CHUNKS
+            template_chunks_prompt = TEMPLATE_CHUNKS
+            template_merge_prompt = TEMPLATE_MERGE
+
+        if self.additional_info is not None:
+            template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
+            template_chunks_prompt = self.additional_info + template_chunks_prompt
+            template_merge_prompt = self.additional_info + template_merge_prompt
+
+        client = state["vectorial_db"]
+
+        if state.get("embeddings"):
+            import openai
+            openai_client = openai.Client()
+
+            answer_db = client.search(
+                collection_name="collection",
+                query_vector=openai_client.embeddings.create(
+                    input=[user_prompt],
+                    model=state.get("embeddings").get("model"),
+                )
+                .data[0]
+                .embedding,
+            )
+        else:
+            answer_db = client.query(
+                collection_name="vectorial_collection",
+                query_text=user_prompt
+            )
+
+        chains_dict = {}
+        elems = [state.get("docs")[elem.id - 1] for elem in answer_db if elem.score > 0.5]
+
+        for i, chunk in enumerate(tqdm(elems,
+                                       desc="Processing chunks",
+                                       disable=not self.verbose)):
+            prompt = PromptTemplate(
+                template=template_chunks_prompt,
+                input_variables=["question"],
+                partial_variables={"context": chunk.get("document"),
+                                   "chunk_id": i + 1,
+                                   "format_instructions": format_instructions}
+            )
+            chain_name = f"chunk{i+1}"
+            chains_dict[chain_name] = prompt | self.llm_model
+
+        async_runner = RunnableParallel(**chains_dict)
+        batch_results = async_runner.invoke({"question": user_prompt})
+
+        merge_prompt = PromptTemplate(
+            template=template_merge_prompt,
+            input_variables=["context", "question"],
+            partial_variables={"format_instructions": format_instructions}
+        )
+
+        merge_chain = merge_prompt | self.llm_model
+        if output_parser:
+            merge_chain = merge_chain | output_parser
+        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+
+        state["answer"] = answer
+
+        return state
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index cc72aaf4..746b10a5 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -26,7 +26,6 @@
 from .base_node import BaseNode
 from jsonschema import validate, ValidationError

-
 class GenerateCodeNode(BaseNode):
     """
     A node that generates Python code for a function that extracts data
@@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict:
         Raises:
             KeyError: If the input keys are not found in the state, indicating that the
                       necessary information for generating an answer is missing.
-            RuntimeError: If the maximum number of iterations is 
+            RuntimeError: If the maximum number of iterations is
                       reached without obtaining the desired code.
         """
@@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict:
             self.logger.info(f"--- (Checking if the information extracted matches the request) ---")
             state = self.semantic_comparison_loop(state)
             if state["errors"]["semantic"]:
-                continue 
+                continue
             break

         if state["iteration"] == self.max_iterations["overall"] and \
@@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict:
             state["errors"]["syntax"] = [syntax_message]
             self.logger.info(f"--- (Syntax Error Found: {syntax_message}) ---")
             analysis = syntax_focused_analysis(state, self.llm_model)
-            self.logger.info(f"""--- (Regenerating Code 
+            self.logger.info(f"""--- (Regenerating Code
                              to fix the Error) ---""")
-            state["generated_code"] = syntax_focused_code_generation(state, 
+            state["generated_code"] = syntax_focused_code_generation(state,
                                                                      analysis, self.llm_model)
             state["generated_code"] = extract_code(state["generated_code"])
         return state
@@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict:
             self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
             analysis = execution_focused_analysis(state, self.llm_model)
             self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
-            state["generated_code"] = execution_focused_code_generation(state, 
+            state["generated_code"] = execution_focused_code_generation(state,
                                                                         analysis, self.llm_model)
             state["generated_code"] = extract_code(state["generated_code"])
         return state

     def validation_reasoning_loop(self, state: dict) -> dict:
         for _ in range(self.max_iterations["validation"]):
-            validation, errors = self.validate_dict(state["execution_result"], 
+            validation, errors = self.validate_dict(state["execution_result"],
                                                     self.output_schema.schema())
             if validation:
                 state["errors"]["validation"] = []
@@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict:

     def semantic_comparison_loop(self, state: dict) -> dict:
         for _ in range(self.max_iterations["semantic"]):
-            comparison_result = self.semantic_comparison(state["execution_result"], 
+            comparison_result = self.semantic_comparison(state["execution_result"],
                                                          state["reference_answer"])
             if comparison_result["are_semantically_equivalent"]:
                 state["errors"]["semantic"] = []
@@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code):
         if not extract_data:
             raise NameError("Function 'extract_data' not found in the generated code.")

-        result = extract_data(self.raw_html) 
+        result = extract_data(self.raw_html)
         return True, result
     except Exception as e:
         return False, f"Error during execution: {str(e)}"
@@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema):
         validate(instance=data, schema=schema)
         return True, None
     except ValidationError as e:
-        errors = e.errors()
+        errors = [e.message]
         return False, errors
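The `validate_dict` fix above is worth noting: jsonschema's `ValidationError` carries a single `message` string, not an `errors()` method (that API belongs to pydantic's `ValidationError`). A small self-contained check of the corrected helper:

```python
from jsonschema import validate
from jsonschema.exceptions import ValidationError

def validate_dict(data: dict, schema: dict):
    """Validate `data` against a JSON schema, returning (ok, errors)."""
    try:
        validate(instance=data, schema=schema)
        return True, None
    except ValidationError as e:
        return False, [e.message]  # .message is a plain string

schema = {"type": "object",
          "properties": {"title": {"type": "string"}},
          "required": ["title"]}
print(validate_dict({"title": "ok"}, schema))  # (True, None)
print(validate_dict({}, schema))               # (False, ["'title' is a required property"])
```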
{str(e)}" @@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema): validate(instance=data, schema=schema) return True, None except ValidationError as e: - errors = e.errors() + errors = [e.message] return False, errors diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py new file mode 100644 index 00000000..6427b051 --- /dev/null +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -0,0 +1,67 @@ +""" +ParseNodeDepthK Module +""" +from typing import List, Optional +from langchain_community.document_transformers import Html2TextTransformer +from .base_node import BaseNode + +class ParseNodeDepthK(BaseNode): + """ + A node responsible for parsing HTML content from a series of documents. + + This node enhances the scraping workflow by allowing for targeted extraction of + content, thereby optimizing the processing of large HTML documents. + + Attributes: + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "ParseNodeDepthK", + ): + super().__init__(node_name, "node", input, output, 1, node_config) + + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def execute(self, state: dict) -> dict: + """ + Executes the node's logic to parse the HTML documents content. + + Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. + + Returns: + dict: The updated state with the output key containing the parsed content chunks. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary information for parsing the content is missing. 
+ """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + input_keys = self.get_input_keys(state) + input_data = [state[key] for key in input_keys] + + documents = input_data[0] + + for doc in documents: + document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) + doc["document"] = document_md[0].page_content + + state.update({self.output[0]: documents}) + + return state diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 1174beee..b67c50e9 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -1,29 +1,10 @@ """ RAGNode Module """ -import os -import sys from typing import List, Optional -from langchain.docstore.document import Document -from langchain.retrievers import ContextualCompressionRetriever -from langchain.retrievers.document_compressors import ( - DocumentCompressorPipeline, - EmbeddingsFilter, -) -from langchain_community.document_transformers import EmbeddingsRedundantFilter -from langchain_community.vectorstores import FAISS -from langchain_community.chat_models import ChatOllama -from langchain_community.embeddings import OllamaEmbeddings -from langchain_aws import BedrockEmbeddings, ChatBedrock -from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from ..utils.logging import get_logger from .base_node import BaseNode -from ..helpers import models_tokens -from ..models import DeepSeek - -optional_modules = {"langchain_anthropic", "langchain_fireworks", - "langchain_groq", "langchain_google_vertexai"} +from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct, VectorParams, Distance class RAGNode(BaseNode): """ @@ -34,7 +15,6 @@ class RAGNode(BaseNode): Attributes: llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. Args: @@ -58,125 +38,62 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - # Execution logic - pass + self.logger.info(f"--- Executing {self.node_name} Node ---") + + if self.node_config.get("client_type") in ["memory", None]: + client = QdrantClient(":memory:") + elif self.node_config.get("client_type") == "local_db": + client = QdrantClient(path="path/to/db") + elif self.node_config.get("client_type") == "image": + client = QdrantClient(url="http://localhost:6333") + else: + raise ValueError("client_type provided not correct") + + docs = [elem.get("summary") for elem in state.get("docs")] + ids = [i for i in range(1, len(state.get("docs"))+1)] + + if state.get("embeddings"): + import openai + openai_client = openai.Client() + + files = state.get("documents") + + array_of_embeddings = [] + i=0 - def _create_default_embedder(self, llm_config=None) -> object: - """ - Create an embedding model instance based on the chosen llm model. + for file in files: + embeddings = openai_client.embeddings.create(input=file, + model=state.get("embeddings").get("model")) + i+=1 + points = PointStruct( + id=i, + vector=embeddings, + payload={"text": file}, + ) - Returns: - object: An instance of the embedding model client. 
+ array_of_embeddings.append(points) - Raises: - ValueError: If the model is not supported. - """ + collection_name = "collection" - if isinstance(self.llm_model, ChatGoogleGenerativeAI): - return GoogleGenerativeAIEmbeddings( - google_api_key=llm_config["api_key"], model="models/embedding-001" + client.create_collection( + collection_name, + vectors_config=VectorParams( + size=1536, + distance=Distance.COSINE, + ), ) - if isinstance(self.llm_model, ChatOpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, - base_url=self.llm_model.openai_api_base) - elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, AzureOpenAIEmbeddings): - return self.llm_model - elif isinstance(self.llm_model, AzureChatOpenAI): - return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, ChatOllama): - params = self.llm_model._lc_kwargs - params.pop("streaming", None) - params.pop("temperature", None) - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, ChatBedrock): - return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) - elif all(key in sys.modules for key in optional_modules): - if isinstance(self.llm_model, ChatFireworks): - from langchain_fireworks import FireworksEmbeddings - return FireworksEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatNVIDIA): - from langchain_nvidia import NVIDIAEmbeddings - return NVIDIAEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatHuggingFace): - from langchain_huggingface import HuggingFaceEmbeddings - return HuggingFaceEmbeddings(model=self.llm_model.model) - if isinstance(self.llm_model, ChatVertexAI): - from langchain_vertexai import VertexAIEmbeddings - return VertexAIEmbeddings() - else: - raise ValueError("Embedding Model missing or not supported") - - def _create_embedder(self, embedder_config: dict) -> object: - """ - Create an embedding model instance based on the configuration provided. - - Args: - embedder_config (dict): Configuration parameters for the embedding model. - - Returns: - object: An instance of the embedding model client. - - Raises: - KeyError: If the model is not supported. 
- """ - embedder_params = {**embedder_config} - if "model_instance" in embedder_config: - return embedder_params["model_instance"] - if "openai" in embedder_params["model"]: - return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - if "azure" in embedder_params["model"]: - return AzureOpenAIEmbeddings() - if "ollama" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["ollama"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OllamaEmbeddings(**embedder_params) - if "gemini" in embedder_params["model"]: - try: - models_tokens["gemini"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - if "bedrock" in embedder_params["model"]: - embedder_params["model"] = embedder_params["model"].split("/")[-1] - client = embedder_params.get("client", None) - try: - models_tokens["bedrock"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - if all(key in sys.modules for key in optional_modules): - if "hugging_face" in embedder_params["model"]: - from langchain_huggingface import HuggingFaceEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return HuggingFaceEmbeddings(model=embedder_params["model"]) - elif "fireworks" in embedder_params["model"]: - from langchain_fireworks import FireworksEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return FireworksEmbeddings(model=embedder_params["model"]) - elif "nvidia" in embedder_params["model"]: - from langchain_nvidia import NVIDIAEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) - - raise ValueError("Model provided by the configuration not supported") + client.upsert(collection_name, points) + + state["vectorial_db"] = client + return state + + client.add( + collection_name="vectorial_collection", + documents=docs, + ids=ids + ) + + state["vectorial_db"] = client + return state diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py new file mode 100644 index 00000000..20df481a --- /dev/null +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -0,0 +1,10 @@ +""" +description node prompts +""" + +DESCRIPTION_NODE_PROMPT = """ +You are a scraper and you have just scraped the +following content from a website. 
diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py
new file mode 100644
index 00000000..20df481a
--- /dev/null
+++ b/scrapegraphai/prompts/description_node_prompts.py
@@ -0,0 +1,10 @@
+"""
+description node prompts
+"""
+
+DESCRIPTION_NODE_PROMPT = """
+You are a scraper and you have just scraped the
+following content from a website. \n
+Please provide a summary description of the content in a maximum of 20 words. \n
+Content of the website: {content}
+"""
\ No newline at end of file
diff --git a/scrapegraphai/prompts/generate_answer_node_prompts.py b/scrapegraphai/prompts/generate_answer_node_prompts.py
index 7c098fe2..1b336fb4 100644
--- a/scrapegraphai/prompts/generate_answer_node_prompts.py
+++ b/scrapegraphai/prompts/generate_answer_node_prompts.py
@@ -2,6 +2,7 @@
 Generate answer node prompts
 """

+
 TEMPLATE_CHUNKS_MD = """
 You are a website scraper and you have just scraped the
 following content from a website converted in markdown format.
@@ -32,6 +33,7 @@
 You are now asked to answer a user question about the content you have scraped.\n
 You have scraped many chunks since the website is big and now you are asked to merge them into a single answer without repetitions (if there are any).\n
 Make sure that if a maximum number of items is specified in the instructions that you get that maximum number and do not exceed it. \n
+The structure should be coherent. \n
 Make sure the output format is a valid JSON and does not contain errors. \n
 OUTPUT INSTRUCTIONS: {format_instructions}\n
 USER QUESTION: {question}\n
diff --git a/scrapegraphai/utils/1_manual.py b/scrapegraphai/utils/1_manual.py
new file mode 100644
index 00000000..21703b7b
--- /dev/null
+++ b/scrapegraphai/utils/1_manual.py
@@ -0,0 +1,92 @@
+import os
+import json
+import time
+import logging
+from typing import Optional
+from urllib.parse import quote, urljoin
+import requests
+import markdownify
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+load_dotenv()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def fetch_content(token: str, target_url: str, max_retries: int = 5, retry_delay: int = 3) -> Optional[str]:
+    encoded_url = quote(target_url)
+    url = f"http://api.scrape.do?url={encoded_url}&token={token}&render=true&waitUntil=networkidle0"
+
+    for _ in range(max_retries):
+        try:
+            response = requests.get(url)
+            if response.status_code == 200:
+                logging.info(f"Successfully fetched content from {target_url}")
+                return response.text
+            logging.warning(f"Failed with status {response.status_code}. Retrying in {retry_delay}s...")
+        except requests.RequestException as e:
+            logging.error(f"Error fetching {target_url}: {e}. Retrying in {retry_delay}s...")
+        time.sleep(retry_delay)
+
+    logging.error(f"Failed to fetch {target_url} after {max_retries} attempts.")
+    return None
+
+def extract_links(html_content: str) -> list:
+    soup = BeautifulSoup(html_content, 'html.parser')
+    links = [link['href'] for link in soup.find_all('a', href=True)]
+    logging.info(f"Extracted {len(links)} links.")
+    return links
+
+def process_links(token: str, base_url: str, links: list, depth: int, current_depth: int = 1) -> dict:
+    content_dict = {}
+    for idx, link in enumerate(links, start=1):
+        full_link = link if link.startswith("http") else urljoin(base_url, link)
+        logging.info(f"Processing link {idx}: {full_link}")
+        link_content = fetch_content(token, full_link)
+        if link_content:
+            markdown_content = markdownify.markdownify(link_content, heading_style="ATX")
+            content_dict[full_link] = markdown_content
+            save_content_to_json(content_dict, idx)
+
+            if current_depth < depth:
+                new_links = extract_links(link_content)
+                content_dict.update(process_links(token, full_link, new_links, depth, current_depth + 1))
+        else:
+            logging.warning(f"Failed to fetch content for {full_link}")
+    return content_dict
+
+def save_content_to_json(content_dict: dict, idx: int):
+    if not os.path.exists("downloaded_pages"):
+        os.makedirs("downloaded_pages")
+
+    file_name = f"scraped_content_{idx}.json"
+    file_path = os.path.join("downloaded_pages", file_name)
+
+    with open(file_path, "w", encoding="utf-8") as json_file:
+        json.dump(content_dict, json_file, ensure_ascii=False, indent=4)
+
+    logging.info(f"Content saved to {file_path}")
+
+if __name__ == "__main__":
+    token = os.getenv("TOKEN")
+    target_url = "https://www.wired.com"
+    depth = 2
+
+    if not token:
+        logging.error("Please set the TOKEN environment variable.")
+        exit(1)
+
+    html_content = fetch_content(token, target_url)
+
+    if html_content:
+        links = extract_links(html_content)
+        logging.info("Links found:")
+        for link in links:
+            logging.info(link)
+
+        content_dict = process_links(token, target_url, links, depth)
+        for link, content in content_dict.items():
+            logging.info(f"Link: {link}")
+            logging.info(f"Content: {content[:500]}...")
+    else:
+        logging.error("Failed to fetch the content.")
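The retry loop in `fetch_content` waits a fixed `retry_delay` between attempts. For rate-limited endpoints, an exponential backoff variant is often gentler on the server; a sketch reusing the same scrape.do URL scheme as the script above (the `timeout` value is an assumption, not part of the original):

```python
import logging
import time
from typing import Optional
from urllib.parse import quote

import requests

def fetch_with_backoff(token: str, target_url: str, max_retries: int = 5) -> Optional[str]:
    """Variant of fetch_content with exponential backoff: waits 1s, 2s, 4s, ..."""
    url = f"http://api.scrape.do?url={quote(target_url)}&token={token}&render=true"
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=60)  # timeout value is an assumption
            if response.status_code == 200:
                return response.text
            logging.warning(f"Status {response.status_code}; retrying...")
        except requests.RequestException as e:
            logging.error(f"Error fetching {target_url}: {e}")
        time.sleep(2 ** attempt)
    return None
```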