Skip to content

Commit b249068

Browse files
move load_url to url.py
1 parent 7fc5fad commit b249068

File tree

3 files changed

+201
-185
lines changed

3 files changed

+201
-185
lines changed

wdoc/utils/loaders/__init__.py

Lines changed: 1 addition & 185 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import sys
1212
import time
1313
import traceback
14-
from functools import cache as memoize
1514
from functools import wraps
1615
from pathlib import Path
1716
from loguru import logger
@@ -23,11 +22,9 @@
2322
import dill
2423
import ffmpeg
2524
import ftfy
26-
import goose3
2725
import httpx
2826
import litellm
2927
import LogseqMarkdownParser
30-
import playwright.sync_api
3128
import pydub
3229
import requests
3330
import uuid6
@@ -37,13 +34,9 @@
3734
from langchain.text_splitter import TextSplitter
3835
from langchain_community.document_loaders import (
3936
Docx2txtLoader,
40-
PlaywrightURLLoader,
41-
SeleniumURLLoader,
4237
UnstructuredEPubLoader,
4338
UnstructuredPowerPointLoader,
44-
UnstructuredURLLoader,
4539
UnstructuredWordDocumentLoader,
46-
WebBaseLoader,
4740
)
4841
from prompt_toolkit import prompt
4942

@@ -67,7 +60,7 @@
6760
wpm,
6861
)
6962
from wdoc.utils.errors import TimeoutPdfLoaderError
70-
from .shared import debug_return_empty
63+
from .shared import debug_return_empty, markdownimage_regex
7164

7265
try:
7366
import torchaudio
@@ -108,20 +101,8 @@
108101

109102
markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)") # to find markdown links
110103
# to replace markdown links by their text
111-
markdownlinkparser_regex = re.compile(r"\[([^\]]+)\]\(http[s]?://[^)]+\)")
112104
# to remove images from jina reader output: they take a lot of tokens but are not yet used
113-
markdownimage_regex = re.compile(
114-
r"!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])", flags=re.MULTILINE
115-
)
116-
117105

118-
def md_shorten_image_name(md_image: re.Match) -> str:
119-
"turn a markdown image link into just the name"
120-
name = md_image.group(1)
121-
if len(name) <= 16:
122-
return name
123-
else:
124-
return name[:8] + "…" + name[-8:]
125106

126107

127108
# to check that a youtube link is valid
@@ -529,21 +510,6 @@ def format_args_with_types(arg_names: List[str]) -> str:
529510
return docs
530511

531512

532-
# Convenience functions #########################
533-
534-
535-
@memoize
536-
def get_url_title(url: str) -> Union[str, type(None)]:
537-
"""if the title of the url is not loaded from the loader, trying as last
538-
resort with this one"""
539-
loader = WebBaseLoader(url, raise_for_status=True)
540-
docs = loader.load()
541-
if "title" in docs[0].metadata and docs[0].metadata["title"]:
542-
return docs[0].metadata["title"]
543-
else:
544-
return None
545-
546-
547513
# loaders #######################################
548514

549515

@@ -1630,156 +1596,6 @@ def load_json_dict(
16301596
return docs
16311597

16321598

1633-
@debug_return_empty
1634-
@optional_strip_unexp_args
1635-
@doc_loaders_cache.cache
1636-
def load_url(path: str, title=None) -> List[Document]:
1637-
logger.info(f"Loading url: '{path}'")
1638-
1639-
# even if loading fails the title might be found so trying to keep
1640-
# the first working title across trials
1641-
if title == "Untitled":
1642-
title = None
1643-
1644-
loaded_success = False
1645-
if not loaded_success:
1646-
try:
1647-
loader = WebBaseLoader("https://r.jina.ai/" + path, raise_for_status=True)
1648-
text = "\n".join([doc.page_content for doc in loader.load()]).strip()
1649-
assert text, "Empty text"
1650-
if not title:
1651-
if text.splitlines()[0].startswith("Title: "):
1652-
title = text.splitlines()[0].replace("Title: ", "", 1)
1653-
text = text.split("Markdown Content:", 1)[1]
1654-
text = markdownlinkparser_regex.sub(r"\1", text) # remove links
1655-
# remove markdown images for now: captioning is disabled so they are just base64 or similar; keep only a shortened image name
1656-
text = markdownimage_regex.sub(md_shorten_image_name, text)
1657-
docs = [
1658-
Document(
1659-
page_content=text,
1660-
metadata={
1661-
"parser": "jinareader",
1662-
},
1663-
)
1664-
]
1665-
if title:
1666-
for doc in docs:
1667-
doc.metadata["title"] = title
1668-
check_docs_tkn_length(docs, path)
1669-
loaded_success = True
1670-
except Exception as err:
1671-
logger.warning(f"Exception when using jina reader to parse url: '{err}'")
1672-
1673-
if not loaded_success:
1674-
try:
1675-
loader = PlaywrightURLLoader(
1676-
urls=[path], remove_selectors=["header", "footer"]
1677-
)
1678-
docs = loader.load()
1679-
assert docs, "Empty docs when using playwright"
1680-
if not title and "title" in docs[0].metadata:
1681-
title = docs[0].metadata["title"]
1682-
check_docs_tkn_length(docs, path)
1683-
loaded_success = True
1684-
except Exception as err:
1685-
logger.warning(f"Exception when using playwright to parse url: '{err}'")
1686-
1687-
if not loaded_success:
1688-
try:
1689-
loader = SeleniumURLLoader(urls=[path], browser="firefox")
1690-
docs = loader.load()
1691-
assert docs, "Empty docs when using selenium firefox"
1692-
if (
1693-
not title
1694-
and "title" in docs[0].metadata
1695-
and docs[0].metadata["title"] != "No title found."
1696-
):
1697-
title = docs[0].metadata["title"]
1698-
check_docs_tkn_length(docs, path)
1699-
loaded_success = True
1700-
except Exception as err:
1701-
logger.warning(
1702-
f"Exception when using selenium firefox to parse url: '{err}'"
1703-
)
1704-
1705-
if not loaded_success:
1706-
try:
1707-
loader = SeleniumURLLoader(urls=[path], browser="chrome")
1708-
docs = loader.load()
1709-
assert docs, "Empty docs when using selenium chrome"
1710-
if (
1711-
not title
1712-
and "title" in docs[0].metadata
1713-
and docs[0].metadata["title"] != "No title found."
1714-
):
1715-
title = docs[0].metadata["title"]
1716-
check_docs_tkn_length(docs, path)
1717-
loaded_success = True
1718-
except Exception as err:
1719-
logger.warning(
1720-
f"Exception when using selenium chrome to parse url: '{err}'\nUsing goose as fallback"
1721-
)
1722-
1723-
if not loaded_success:
1724-
try:
1725-
g = goose3.Goose()
1726-
article = g.extract(url=path)
1727-
text = article.cleaned_text
1728-
docs = [Document(page_content=text)]
1729-
assert docs, "Empty docs when using goose"
1730-
if not title:
1731-
if "title" in docs[0].metadata and docs[0].metadata["title"]:
1732-
title = docs[0].metadata["title"]
1733-
elif article.title:
1734-
title = article.title
1735-
check_docs_tkn_length(docs, path)
1736-
loaded_success = True
1737-
except Exception as err:
1738-
logger.warning(f"Exception when using goose to parse url: '{err}'")
1739-
1740-
if not loaded_success:
1741-
try:
1742-
loader = UnstructuredURLLoader([path])
1743-
docs = loader.load()
1744-
assert docs, "Empty docs when using UnstructuredURLLoader"
1745-
if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
1746-
title = docs[0].metadata["title"]
1747-
check_docs_tkn_length(docs, path)
1748-
loaded_success = True
1749-
except Exception as err:
1750-
logger.warning(
1751-
f"Exception when using UnstructuredURLLoader to parse url: '{err}'"
1752-
)
1753-
1754-
if not loaded_success:
1755-
try:
1756-
loader = WebBaseLoader(path, raise_for_status=True)
1757-
docs = loader.load()
1758-
assert docs, "Empty docs when using html"
1759-
if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
1760-
title = docs[0].metadata["title"]
1761-
check_docs_tkn_length(docs, path)
1762-
loaded_success = True
1763-
except Exception as err:
1764-
logger.warning(
1765-
f"Exception when using html as LAST RESORT to parse url: '{err}'"
1766-
)
1767-
1768-
# last resort, try to get the title from the most basic loader
1769-
if not title:
1770-
title = get_url_title(path)
1771-
1772-
# store the title as metadata if missing
1773-
if title:
1774-
for d in docs:
1775-
if "title" not in d.metadata or not d.metadata["title"]:
1776-
d.metadata["title"] = title
1777-
else:
1778-
if d.metadata["title"] != title:
1779-
d.metadata["title"] = f"{title} - {d.metadata['title']}"
1780-
1781-
return docs
1782-
17831599

17841600
@debug_return_empty
17851601
@optional_strip_unexp_args

wdoc/utils/loaders/shared.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77

88
from wdoc.utils.env import env
99

10+
markdownimage_regex = re.compile(
11+
r"!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])", flags=re.MULTILINE
12+
)
13+
1014

1115
def debug_return_empty(func: Callable) -> Callable:
1216
if env.WDOC_EMPTY_LOADER:
@@ -57,3 +61,15 @@ def signal_handler(signum, frame):
5761
finally:
5862
# Disable the alarm
5963
signal.alarm(0)
64+
65+
66+
@memoize
67+
def get_url_title(url: str) -> Union[str, type(None)]:
68+
"""if the title of the url is not loaded from the loader, trying as last
69+
resort with this one"""
70+
loader = WebBaseLoader(url, raise_for_status=True)
71+
docs = loader.load()
72+
if "title" in docs[0].metadata and docs[0].metadata["title"]:
73+
return docs[0].metadata["title"]
74+
else:
75+
return None

0 commit comments

Comments
 (0)