diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb79a8a7..e09ddccc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## [1.25.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.1...v1.25.2) (2024-10-03)
+
+
+### Bug Fixes
+
+* update dependencies ([7579d0e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7579d0e2599d63c0003b1b7a0918132511a9c8f1))
+
+## [1.25.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.25.1) (2024-09-29)
## [1.26.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.2...v1.26.0-beta.3) (2024-10-04)
diff --git a/README.md b/README.md
index 5d79bf55..94beb617 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,6 @@ The output will be a dictionary like the following:
"contact_email": "contact@scrapegraphai.com"
}
```
-
There are other pipelines that can be used to extract information from multiple pages, generate Python scripts, or even generate audio files.
| Pipeline Name | Description |
@@ -110,6 +109,8 @@ There are other pipelines that can be used to extract information from multiple
| SmartScraperMultiGraph | Multi-page scraper that extracts information from multiple pages given a single prompt and a list of sources. |
| ScriptCreatorMultiGraph | Multi-page scraper that generates a Python script for extracting information from multiple pages and sources. |
+For each of these graphs there is a multi version, which allows the LLM calls to be made in parallel.
+
It is possible to use different LLM through APIs, such as **OpenAI**, **Groq**, **Azure** and **Gemini**, or local models using **Ollama**.
Remember to have [Ollama](https://ollama.com/) installed and download the models using the **ollama pull** command, if you want to use local models.
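
The multi versions referenced above follow the same call shape as the single-page graphs; a minimal sketch, assuming the config keys mirror the single-page example earlier in this README (the model string and URLs are placeholders, not values from this diff):

```python
from scrapegraphai.graphs import SmartScraperMultiGraph

# Assumed config shape, mirroring the single-page example; the exact
# keys depend on the provider you choose.
graph_config = {
    "llm": {
        "api_key": "YOUR_OPENAI_API_KEY",
        "model": "openai/gpt-4o-mini",
    },
}

# One prompt, several sources: the per-page LLM calls run in parallel.
multi_graph = SmartScraperMultiGraph(
    prompt="List the projects with their descriptions",
    source=["https://example.com/page-1", "https://example.com/page-2"],
    config=graph_config,
)
print(multi_graph.run())
```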
@@ -140,6 +141,9 @@ Check out also the Docusaurus [here](https://scrapegraph-doc.onrender.com/).
+
+
+
## π€ Contributing
@@ -152,34 +156,6 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra
[](https://www.linkedin.com/company/scrapegraphai/)
[](https://twitter.com/scrapegraphai)
-## πΊοΈ Roadmap
-
-We are working on the following features! If you are interested in collaborating right-click on the feature and open in a new tab to file a PR. If you have doubts and wanna discuss them with us, just contact us on [discord](https://discord.gg/uJN7TYcpNa) or open a [Discussion](https://github.com/VinciGit00/Scrapegraph-ai/discussions) here on Github!
-
-```mermaid
-%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#5C4B9B', 'edgeLabelBackground':'#ffffff', 'tertiaryColor': '#ffffff', 'primaryBorderColor': '#5C4B9B', 'fontFamily': 'Arial', 'fontSize': '16px', 'textColor': '#5C4B9B' }}}%%
-graph LR
- A[DeepSearch Graph] --> F[Use Existing Chromium Instances]
- F --> B[Page Caching]
- B --> C[Screenshot Scraping]
- C --> D[Handle Dynamic Content]
- D --> E[New Webdrivers]
-
- style A fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
- style F fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
- style B fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
- style C fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
- style D fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
- style E fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-
- click A href "https://github.com/VinciGit00/Scrapegraph-ai/issues/260" "Open DeepSearch Graph Issue"
- click F href "https://github.com/VinciGit00/Scrapegraph-ai/issues/329" "Open Chromium Instances Issue"
- click B href "https://github.com/VinciGit00/Scrapegraph-ai/issues/197" "Open Page Caching Issue"
- click C href "https://github.com/VinciGit00/Scrapegraph-ai/issues/197" "Open Screenshot Scraping Issue"
- click D href "https://github.com/VinciGit00/Scrapegraph-ai/issues/279" "Open Handle Dynamic Content Issue"
- click E href "https://github.com/VinciGit00/Scrapegraph-ai/issues/171" "Open New Webdrivers Issue"
-```
-
## π Telemetry
We collect anonymous usage metrics to enhance our package's quality and user experience. The data helps us prioritize improvements and ensure compatibility. If you wish to opt-out, set the environment variable SCRAPEGRAPHAI_TELEMETRY_ENABLED=false. For more information, please refer to the documentation [here](https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/telemetry.html).
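
Besides exporting the variable in the shell, the flag can be set from Python; a minimal sketch, assuming it must be in place before the package is first imported:

```python
import os

# Assumption: set the flag before the first import so it takes effect.
os.environ["SCRAPEGRAPHAI_TELEMETRY_ENABLED"] = "false"

import scrapegraphai  # telemetry is now disabled for this process
```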
diff --git a/docs/assets/scrapedo.png b/docs/assets/scrapedo.png
new file mode 100644
index 00000000..d15ea04e
Binary files /dev/null and b/docs/assets/scrapedo.png differ
diff --git a/pyproject.toml b/pyproject.toml
index 84201a5f..403201ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "scrapegraphai"
-version = "1.26.0b3"
+version = "1.25.2"
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
authors = [
@@ -30,6 +30,7 @@ dependencies = [
"undetected-playwright>=0.3.0",
"google>=3.0.0",
"langchain-ollama>=0.1.3",
+
"semchunk==2.2.0",
"transformers==4.44.2",
"qdrant-client>=1.11.3",
diff --git a/scrapegraphai/utils/cleanup_code.py b/scrapegraphai/utils/cleanup_code.py
index ad3d437b..7eedde4d 100644
--- a/scrapegraphai/utils/cleanup_code.py
+++ b/scrapegraphai/utils/cleanup_code.py
@@ -4,6 +4,9 @@
import re
def extract_code(code: str) -> str:
+    """
+    Extracts the content of a fenced markdown code block from a string.
+    """
pattern = r'```(?:python)?\n(.*?)```'
match = re.search(pattern, code, re.DOTALL)
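
A quick usage sketch of the regex above; the fallback to the raw string when no fence is found is an assumption, since the hunk cuts off before the function's return statement:

```python
import re

pattern = r'```(?:python)?\n(.*?)```'

fence = "```"
llm_reply = f"Here is the fix:\n{fence}python\nprint('hello')\n{fence}\nDone."
match = re.search(pattern, llm_reply, re.DOTALL)

# group(1) holds only the fenced body, without the backtick markers
code = match.group(1) if match else llm_reply
print(code)  # -> print('hello')
```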
diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index 832f811e..2ec3b140 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -101,7 +101,7 @@ def reduce_html(html, reduction):
for attr in list(tag.attrs):
if attr not in attrs_to_keep:
del tag[attr]
-
+
if reduction == 1:
return minify_html(str(soup))
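
For context, the attribute-stripping loop this whitespace fix touches, as a self-contained sketch; the attrs_to_keep set here is an illustrative assumption, since the real function derives its behaviour from the reduction parameter:

```python
from bs4 import BeautifulSoup

html = '<div class="x" data-id="7"><a href="/a" style="c">hi</a></div>'
attrs_to_keep = {"href"}  # assumption: keep only link targets

soup = BeautifulSoup(html, "html.parser")
for tag in soup.find_all(True):       # every element in the tree
    for attr in list(tag.attrs):      # copy the keys: we mutate while iterating
        if attr not in attrs_to_keep:
            del tag[attr]

print(str(soup))  # -> <div><a href="/a">hi</a></div>
```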
diff --git a/scrapegraphai/utils/code_error_analysis.py b/scrapegraphai/utils/code_error_analysis.py
index ac955502..62b56eb9 100644
--- a/scrapegraphai/utils/code_error_analysis.py
+++ b/scrapegraphai/utils/code_error_analysis.py
@@ -2,16 +2,17 @@
This module contains the functions that are used to generate the prompts for the code error analysis.
"""
from typing import Any, Dict
+import json
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
-import json
from ..prompts import (
TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_EXECUTION_ANALYSIS,
TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_SEMANTIC_ANALYSIS
)
def syntax_focused_analysis(state: dict, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"])
+ prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS,
+ input_variables=["generated_code", "errors"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"generated_code": state["generated_code"],
@@ -19,7 +20,9 @@ def syntax_focused_analysis(state: dict, llm_model) -> str:
})
def execution_focused_analysis(state: dict, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, input_variables=["generated_code", "errors", "html_code", "html_analysis"])
+ prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS,
+ input_variables=["generated_code", "errors",
+ "html_code", "html_analysis"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"generated_code": state["generated_code"],
@@ -29,7 +32,9 @@ def execution_focused_analysis(state: dict, llm_model) -> str:
})
def validation_focused_analysis(state: dict, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, input_variables=["generated_code", "errors", "json_schema", "execution_result"])
+ prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS,
+ input_variables=["generated_code", "errors",
+ "json_schema", "execution_result"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"generated_code": state["generated_code"],
@@ -39,7 +44,9 @@ def validation_focused_analysis(state: dict, llm_model) -> str:
})
def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, input_variables=["generated_code", "differences", "explanation"])
+ prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS,
+ input_variables=["generated_code",
+ "differences", "explanation"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"generated_code": state["generated_code"],
diff --git a/scrapegraphai/utils/code_error_correction.py b/scrapegraphai/utils/code_error_correction.py
index 52e92e4c..d70c4eef 100644
--- a/scrapegraphai/utils/code_error_correction.py
+++ b/scrapegraphai/utils/code_error_correction.py
@@ -10,7 +10,8 @@
)
def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"])
+ prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION,
+ input_variables=["analysis", "generated_code"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"analysis": analysis,
@@ -18,7 +19,8 @@ def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str
})
def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"])
+ prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION,
+ input_variables=["analysis", "generated_code"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"analysis": analysis,
@@ -26,16 +28,20 @@ def execution_focused_code_generation(state: dict, analysis: str, llm_model) ->
})
def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"])
+ prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION,
+ input_variables=["analysis", "generated_code",
+ "json_schema"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"analysis": analysis,
"generated_code": state["generated_code"],
"json_schema": state["json_schema"]
})
-
+
def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
- prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
+ prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION,
+ input_variables=["analysis", "generated_code",
+ "generated_result", "reference_result"])
chain = prompt | llm_model | StrOutputParser()
return chain.invoke({
"analysis": analysis,