diff --git a/CHANGELOG.md b/CHANGELOG.md index cb79a8a7..e09ddccc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## [1.25.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.1...v1.25.2) (2024-10-03) + + +### Bug Fixes + +* update dependencies ([7579d0e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7579d0e2599d63c0003b1b7a0918132511a9c8f1)) + +## [1.25.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.25.1) (2024-09-29) ## [1.26.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.2...v1.26.0-beta.3) (2024-10-04) diff --git a/README.md b/README.md index 5d79bf55..94beb617 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,6 @@ The output will be a dictionary like the following: "contact_email": "contact@scrapegraphai.com" } ``` - There are other pipelines that can be used to extract information from multiple pages, generate Python scripts, or even generate audio files. | Pipeline Name | Description | @@ -110,6 +109,8 @@ There are other pipelines that can be used to extract information from multiple | SmartScraperMultiGraph | Multi-page scraper that extracts information from multiple pages given a single prompt and a list of sources. | | ScriptCreatorMultiGraph | Multi-page scraper that generates a Python script for extracting information from multiple pages and sources. | +Each of these graphs also has a multi version, which makes the LLM calls in parallel. + It is possible to use different LLM through APIs, such as **OpenAI**, **Groq**, **Azure** and **Gemini**, or local models using **Ollama**. Remember to have [Ollama](https://ollama.com/) installed and download the models using the **ollama pull** command, if you want to use local models. @@ -140,6 +141,9 @@ Check out also the Docusaurus [here](https://scrapegraph-doc.onrender.com/). 
Stats + + Stats + ## 🤝 Contributing @@ -152,34 +156,6 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) [![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai) -## 🗺️ Roadmap - -We are working on the following features! If you are interested in collaborating right-click on the feature and open in a new tab to file a PR. If you have doubts and wanna discuss them with us, just contact us on [discord](https://discord.gg/uJN7TYcpNa) or open a [Discussion](https://github.com/VinciGit00/Scrapegraph-ai/discussions) here on Github! - -```mermaid -%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#5C4B9B', 'edgeLabelBackground':'#ffffff', 'tertiaryColor': '#ffffff', 'primaryBorderColor': '#5C4B9B', 'fontFamily': 'Arial', 'fontSize': '16px', 'textColor': '#5C4B9B' }}}%% -graph LR - A[DeepSearch Graph] --> F[Use Existing Chromium Instances] - F --> B[Page Caching] - B --> C[Screenshot Scraping] - C --> D[Handle Dynamic Content] - D --> E[New Webdrivers] - - style A fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10 - style F fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10 - style B fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10 - style C fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10 - style D fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10 - style E fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10 - - click A href "https://github.com/VinciGit00/Scrapegraph-ai/issues/260" "Open DeepSearch Graph Issue" - click F href "https://github.com/VinciGit00/Scrapegraph-ai/issues/329" "Open Chromium Instances Issue" - click B href "https://github.com/VinciGit00/Scrapegraph-ai/issues/197" "Open Page Caching Issue" - click C href "https://github.com/VinciGit00/Scrapegraph-ai/issues/197" "Open Screenshot Scraping Issue" - click D href 
"https://github.com/VinciGit00/Scrapegraph-ai/issues/279" "Open Handle Dynamic Content Issue" - click E href "https://github.com/VinciGit00/Scrapegraph-ai/issues/171" "Open New Webdrivers Issue" -``` - ## 📈 Telemetry We collect anonymous usage metrics to enhance our package's quality and user experience. The data helps us prioritize improvements and ensure compatibility. If you wish to opt-out, set the environment variable SCRAPEGRAPHAI_TELEMETRY_ENABLED=false. For more information, please refer to the documentation [here](https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/telemetry.html). diff --git a/docs/assets/scrapedo.png b/docs/assets/scrapedo.png new file mode 100644 index 00000000..d15ea04e Binary files /dev/null and b/docs/assets/scrapedo.png differ diff --git a/pyproject.toml b/pyproject.toml index 84201a5f..403201ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.26.0b3" +version = "1.25.2" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ @@ -30,6 +30,7 @@ dependencies = [ "undetected-playwright>=0.3.0", "google>=3.0.0", "langchain-ollama>=0.1.3", + "semchunk==2.2.0", "transformers==4.44.2", "qdrant-client>=1.11.3", diff --git a/scrapegraphai/utils/cleanup_code.py b/scrapegraphai/utils/cleanup_code.py index ad3d437b..7eedde4d 100644 --- a/scrapegraphai/utils/cleanup_code.py +++ b/scrapegraphai/utils/cleanup_code.py @@ -4,6 +4,9 @@ import re def extract_code(code: str) -> str: + """ + Module for extracting code + """ pattern = r'```(?:python)?\n(.*?)```' match = re.search(pattern, code, re.DOTALL) diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 832f811e..2ec3b140 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -101,7 +101,7 @@ def reduce_html(html, reduction): for attr in list(tag.attrs): if attr not in attrs_to_keep: del tag[attr] - + if reduction == 1: return minify_html(str(soup)) diff --git a/scrapegraphai/utils/code_error_analysis.py b/scrapegraphai/utils/code_error_analysis.py index ac955502..62b56eb9 100644 --- a/scrapegraphai/utils/code_error_analysis.py +++ b/scrapegraphai/utils/code_error_analysis.py @@ -2,16 +2,17 @@ This module contains the functions that are used to generate the prompts for the code error analysis. 
""" from typing import Any, Dict +import json from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser -import json from ..prompts import ( TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_SEMANTIC_ANALYSIS ) def syntax_focused_analysis(state: dict, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"]) + prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, + input_variables=["generated_code", "errors"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "generated_code": state["generated_code"], @@ -19,7 +20,9 @@ def syntax_focused_analysis(state: dict, llm_model) -> str: }) def execution_focused_analysis(state: dict, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, input_variables=["generated_code", "errors", "html_code", "html_analysis"]) + prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, + input_variables=["generated_code", "errors", + "html_code", "html_analysis"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "generated_code": state["generated_code"], @@ -29,7 +32,9 @@ def execution_focused_analysis(state: dict, llm_model) -> str: }) def validation_focused_analysis(state: dict, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, input_variables=["generated_code", "errors", "json_schema", "execution_result"]) + prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, + input_variables=["generated_code", "errors", + "json_schema", "execution_result"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "generated_code": state["generated_code"], @@ -39,7 +44,9 @@ def validation_focused_analysis(state: dict, llm_model) -> str: }) def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], llm_model) -> str: - prompt = 
PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, input_variables=["generated_code", "differences", "explanation"]) + prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, + input_variables=["generated_code", + "differences", "explanation"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "generated_code": state["generated_code"], diff --git a/scrapegraphai/utils/code_error_correction.py b/scrapegraphai/utils/code_error_correction.py index 52e92e4c..d70c4eef 100644 --- a/scrapegraphai/utils/code_error_correction.py +++ b/scrapegraphai/utils/code_error_correction.py @@ -10,7 +10,8 @@ ) def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"]) + prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, + input_variables=["analysis", "generated_code"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "analysis": analysis, @@ -18,7 +19,8 @@ def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str }) def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"]) + prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, + input_variables=["analysis", "generated_code"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "analysis": analysis, @@ -26,16 +28,20 @@ def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> }) def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"]) + prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, + input_variables=["analysis", "generated_code", + 
"json_schema"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "analysis": analysis, "generated_code": state["generated_code"], "json_schema": state["json_schema"] }) - + def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> str: - prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"]) + prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, + input_variables=["analysis", "generated_code", + "generated_result", "reference_result"]) chain = prompt | llm_model | StrOutputParser() return chain.invoke({ "analysis": analysis,