diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index e5427044..2a0f261a 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -25,4 +25,5 @@ from .concat_answers_node import ConcatAnswersNode from .prompt_refiner_node import PromptRefinerNode from .html_analyzer_node import HtmlAnalyzerNode -from .generate_code_node import GenerateCodeNode \ No newline at end of file +from .generate_code_node import GenerateCodeNode +from .reasoning_node import ReasoningNode \ No newline at end of file diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index 1174a4aa..bcb7ea74 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -5,17 +5,16 @@ from langchain.prompts import PromptTemplate from langchain.output_parsers import ResponseSchema, StructuredOutputParser from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_community.chat_models import ChatOllama import ast import sys from io import StringIO from bs4 import BeautifulSoup import re -from tqdm import tqdm -from .base_node import BaseNode +import json +from jsonschema import validate, ValidationError from pydantic import ValidationError +from .base_node import BaseNode from ..utils import (transform_schema, extract_code, syntax_focused_analysis, syntax_focused_code_generation, @@ -23,8 +22,6 @@ validation_focused_analysis, validation_focused_code_generation, semantic_focused_analysis, semantic_focused_code_generation, are_content_equal) -from jsonschema import validate, ValidationError -import json from ..prompts import ( TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON ) diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py index b07c4040..26304dcd 100644 --- 
a/scrapegraphai/nodes/html_analyzer_node.py +++ b/scrapegraphai/nodes/html_analyzer_node.py @@ -4,10 +4,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass from langchain_community.chat_models import ChatOllama -from tqdm import tqdm from .base_node import BaseNode from ..utils import reduce_html from ..prompts import ( diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py index dfb62eb6..7cc53020 100644 --- a/scrapegraphai/nodes/prompt_refiner_node.py +++ b/scrapegraphai/nodes/prompt_refiner_node.py @@ -4,12 +4,7 @@ from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnableParallel -from langchain_core.utils.pydantic import is_basemodel_subclass -from langchain_openai import ChatOpenAI, AzureChatOpenAI -from langchain_mistralai import ChatMistralAI from langchain_community.chat_models import ChatOllama -from tqdm import tqdm from .base_node import BaseNode from ..utils import transform_schema from ..prompts import ( diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py new file mode 100644 index 00000000..431d8ab1 --- /dev/null +++ b/scrapegraphai/nodes/reasoning_node.py @@ -0,0 +1,96 @@ +""" +ReasoningNode Module +""" +from typing import List, Optional +from langchain.prompts import PromptTemplate +from langchain_core.output_parsers import StrOutputParser +from langchain_community.chat_models import ChatOllama +from .base_node import BaseNode +from ..utils import transform_schema +from ..prompts import ( + TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT +) + +class ReasoningNode(BaseNode): + """ + A node that refines the user prompt with the use 
of the schema and additional context and + create a precise prompt in subsequent steps that explicitly link elements in the user's + original input to their corresponding representations in the JSON schema. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "PromptRefiner", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + + if isinstance(node_config["llm_model"], ChatOllama): + self.llm_model.format="json" + + self.verbose = ( + True if node_config is None else node_config.get("verbose", False) + ) + self.force = ( + False if node_config is None else node_config.get("force", False) + ) + + self.additional_info = node_config.get("additional_info", None) + + self.output_schema = node_config.get("schema") + + def execute(self, state: dict) -> dict: + """ + Generate a refined prompt for the reasoning task based on the user's input and the JSON schema. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with the output key containing the generated answer. + + Raises: + KeyError: If the input keys are not found in the state, indicating + that the necessary information for generating an answer is missing. 
+ """ + + self.logger.info(f"--- Executing {self.node_name} Node ---") + + user_prompt = state['user_prompt'] + + self.simplefied_schema = transform_schema(self.output_schema.schema()) + + if self.additional_info is not None: + prompt = PromptTemplate( + template=TEMPLATE_REASONING_WITH_CONTEXT, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema), + "additional_context": self.additional_info}) + else: + prompt = PromptTemplate( + template=TEMPLATE_REASONING, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema)}) + + output_parser = StrOutputParser() + + chain = prompt | self.llm_model | output_parser + refined_prompt = chain.invoke({}) + + state.update({self.output[0]: refined_prompt}) + return state diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py index f7be89c1..ab34580b 100644 --- a/scrapegraphai/prompts/__init__.py +++ b/scrapegraphai/prompts/__init__.py @@ -18,4 +18,5 @@ TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS, - TEMPLATE_SEMANTIC_CODE_GENERATION) \ No newline at end of file + TEMPLATE_SEMANTIC_CODE_GENERATION) +from .reasoning_node_prompts import TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT \ No newline at end of file diff --git a/scrapegraphai/prompts/reasoning_node_prompts.py b/scrapegraphai/prompts/reasoning_node_prompts.py new file mode 100644 index 00000000..47ceaa41 --- /dev/null +++ b/scrapegraphai/prompts/reasoning_node_prompts.py @@ -0,0 +1,72 @@ +""" +Reasoning prompts helper +""" + +TEMPLATE_REASONING = """ +**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file. 
+ +**User's Request**: +{user_input} + +**Target JSON Schema**: +```json +{json_schema} +``` + +**Analysis Instructions**: +1. **Interpret User Request:** +* Identify the key information types or entities the user is seeking. +* Note any specific attributes, relationships, or constraints mentioned. + +2. **Map to JSON Schema**: +* For each identified element in the user request, locate its corresponding field in the JSON schema. +* Explain how the schema structure represents the requested information. +* Highlight any relevant schema elements not explicitly mentioned in the user's request. + +3. **Data Transformation Guidance**: +* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. + +This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. + +**Reasoning Output**: +[Your detailed analysis based on the above instructions] +""" + +TEMPLATE_REASONING_WITH_CONTEXT = """ +**Task**: Analyze the user's request and the provided JSON schema to guide an LLM in extracting information directly from a markdown file previously parsed from an HTML file. + +**User's Request**: +{user_input} + +**Target JSON Schema**: +```json +{json_schema} +``` + +**Additional Context**: +{additional_context} + +**Analysis Instructions**: +1. **Interpret User Request and Context:** +* Identify the key information types or entities the user is seeking. +* Note any specific attributes, relationships, or constraints mentioned. +* Incorporate insights from the additional context to refine understanding of the task. + +2. **Map to JSON Schema**: +* For each identified element in the user request, locate its corresponding field in the JSON schema. +* Explain how the schema structure represents the requested information. 
+* Highlight any relevant schema elements not explicitly mentioned in the user's request. + +3. **Extraction Strategy**: +* Based on the additional context, suggest specific strategies for locating and extracting the required information from the HTML. +* Highlight any potential challenges or special considerations mentioned in the context. + +4. **Data Transformation Guidance**: +* Provide guidance on any necessary transformations to align extracted data with the JSON schema requirements. +* Note any special formatting, validation, or business logic considerations from the additional context. + +This analysis will be used to instruct an LLM that has the HTML content in its context. The LLM will use this guidance to extract the information and return it directly in the specified JSON format. + +**Reasoning Output**: +[Your detailed analysis based on the above instructions, incorporating insights from the additional context] +"""