move load_url to url.py

thiswillbeyourgithub · thiswillbeyourgithub · commit b2490688000a · 2025-09-13T11:31:14.000+02:00
diff --git a/wdoc/utils/loaders/__init__.py b/wdoc/utils/loaders/__init__.py
@@ -11,7 +11,6 @@
 import sys
 import time
 import traceback
-from functools import cache as memoize
 from functools import wraps
 from pathlib import Path
 from loguru import logger
@@ -23,11 +22,9 @@
 import dill
 import ffmpeg
 import ftfy
-import goose3
 import httpx
 import litellm
 import LogseqMarkdownParser
-import playwright.sync_api
 import pydub
 import requests
 import uuid6
@@ -37,13 +34,9 @@
 from langchain.text_splitter import TextSplitter
 from langchain_community.document_loaders import (
     Docx2txtLoader,
-    PlaywrightURLLoader,
-    SeleniumURLLoader,
     UnstructuredEPubLoader,
     UnstructuredPowerPointLoader,
-    UnstructuredURLLoader,
     UnstructuredWordDocumentLoader,
-    WebBaseLoader,
 )
 from prompt_toolkit import prompt
 
@@ -67,7 +60,7 @@
     wpm,
 )
 from wdoc.utils.errors import TimeoutPdfLoaderError
-from .shared import debug_return_empty
+from .shared import debug_return_empty, markdownimage_regex
 
 try:
     import torchaudio
@@ -108,20 +101,8 @@
 
 markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)")  # to find markdown links
 # to replace markdown links by their text
-markdownlinkparser_regex = re.compile(r"\[([^\]]+)\]\(http[s]?://[^)]+\)")
 # to remove image from jina reader that take a lot of tokens but are not yet used
-markdownimage_regex = re.compile(
-    r"!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])", flags=re.MULTILINE
-)
-
 
-def md_shorten_image_name(md_image: re.Match) -> str:
-    "turn a markdown image link into just the name"
-    name = md_image.group(1)
-    if len(name) <= 16:
-        return name
-    else:
-        return name[:8] + "…" + name[-8:]
 
 
 # to check that a youtube link is valid
@@ -529,21 +510,6 @@ def format_args_with_types(arg_names: List[str]) -> str:
     return docs
 
 
-# Convenience functions #########################
-
-
-@memoize
-def get_url_title(url: str) -> Union[str, type(None)]:
-    """if the title of the url is not loaded from the loader, trying as last
-    resort with this one"""
-    loader = WebBaseLoader(url, raise_for_status=True)
-    docs = loader.load()
-    if "title" in docs[0].metadata and docs[0].metadata["title"]:
-        return docs[0].metadata["title"]
-    else:
-        return None
-
-
 # loaders #######################################
 
 
@@ -1630,156 +1596,6 @@ def load_json_dict(
     return docs
 
 
-@debug_return_empty
-@optional_strip_unexp_args
-@doc_loaders_cache.cache
-def load_url(path: str, title=None) -> List[Document]:
-    logger.info(f"Loading url: '{path}'")
-
-    # even if loading fails the title might be found so trying to keep
-    # the first working title across trials
-    if title == "Untitled":
-        title = None
-
-    loaded_success = False
-    if not loaded_success:
-        try:
-            loader = WebBaseLoader("https://r.jina.ai/" + path, raise_for_status=True)
-            text = "\n".join([doc.page_content for doc in loader.load()]).strip()
-            assert text, "Empty text"
-            if not title:
-                if text.splitlines()[0].startswith("Title: "):
-                    title = text.splitlines()[0].replace("Title: ", "", 1)
-            text = text.split("Markdown Content:", 1)[1]
-            text = markdownlinkparser_regex.sub(r"\1", text)  # remove links
-            # remove markdown images for now as caption is disabled so it's just base64 or something like that, keep only a shorten image name
-            text = markdownimage_regex.sub(md_shorten_image_name, text)
-            docs = [
-                Document(
-                    page_content=text,
-                    metadata={
-                        "parser": "jinareader",
-                    },
-                )
-            ]
-            if title:
-                for doc in docs:
-                    doc.metadata["title"] = title
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(f"Exception when using jina reader to parse url: '{err}'")
-
-    if not loaded_success:
-        try:
-            loader = PlaywrightURLLoader(
-                urls=[path], remove_selectors=["header", "footer"]
-            )
-            docs = loader.load()
-            assert docs, "Empty docs when using playwright"
-            if not title and "title" in docs[0].metadata:
-                title = docs[0].metadata["title"]
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(f"Exception when using playwright to parse url: '{err}'")
-
-    if not loaded_success:
-        try:
-            loader = SeleniumURLLoader(urls=[path], browser="firefox")
-            docs = loader.load()
-            assert docs, "Empty docs when using selenium firefox"
-            if (
-                not title
-                and "title" in docs[0].metadata
-                and docs[0].metadata["title"] != "No title found."
-            ):
-                title = docs[0].metadata["title"]
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(
-                f"Exception when using selenium firefox to parse url: '{err}'"
-            )
-
-    if not loaded_success:
-        try:
-            loader = SeleniumURLLoader(urls=[path], browser="chrome")
-            docs = loader.load()
-            assert docs, "Empty docs when using selenium chrome"
-            if (
-                not title
-                and "title" in docs[0].metadata
-                and docs[0].metadata["title"] != "No title found."
-            ):
-                title = docs[0].metadata["title"]
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(
-                f"Exception when using selenium chrome to parse url: '{err}'\nUsing goose as fallback"
-            )
-
-    if not loaded_success:
-        try:
-            g = goose3.Goose()
-            article = g.extract(url=path)
-            text = article.cleaned_text
-            docs = [Document(page_content=text)]
-            assert docs, "Empty docs when using goose"
-            if not title:
-                if "title" in docs[0].metadata and docs[0].metadata["title"]:
-                    title = docs[0].metadata["title"]
-                elif article.title:
-                    title = article.title
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(f"Exception when using goose to parse url: '{err}'")
-
-    if not loaded_success:
-        try:
-            loader = UnstructuredURLLoader([path])
-            docs = loader.load()
-            assert docs, "Empty docs when using UnstructuredURLLoader"
-            if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
-                title = docs[0].metadata["title"]
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(
-                f"Exception when using UnstructuredURLLoader to parse url: '{err}'"
-            )
-
-    if not loaded_success:
-        try:
-            loader = WebBaseLoader(path, raise_for_status=True)
-            docs = loader.load()
-            assert docs, "Empty docs when using html"
-            if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
-                title = docs[0].metadata["title"]
-            check_docs_tkn_length(docs, path)
-            loaded_success = True
-        except Exception as err:
-            logger.warning(
-                f"Exception when using html as LAST RESORT to parse url: '{err}'"
-            )
-
-    # last resort, try to get the title from the most basic loader
-    if not title:
-        title = get_url_title(path)
-
-    # store the title as metadata if missing
-    if title:
-        for d in docs:
-            if "title" not in d.metadata or not d.metadata["title"]:
-                d.metadata["title"] = title
-            else:
-                if d.metadata["title"] != title:
-                    d.metadata["title"] = f"{title} - {d.metadata['title']}"
-
-    return docs
-
 
 @debug_return_empty
 @optional_strip_unexp_args
diff --git a/wdoc/utils/loaders/shared.py b/wdoc/utils/loaders/shared.py
@@ -7,6 +7,10 @@
 
 from wdoc.utils.env import env
 
+markdownimage_regex = re.compile(
+    r"!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])", flags=re.MULTILINE
+)  # matches markdown images `![alt](target)` / `![alt][ref]`; group 1 is the alt text
+
 
 def debug_return_empty(func: Callable) -> Callable:
     if env.WDOC_EMPTY_LOADER:
@@ -57,3 +61,15 @@ def signal_handler(signum, frame):
         finally:
             # Disable the alarm
             signal.alarm(0)
+
+
+@memoize  # NOTE(review): this diff adds no imports to shared.py — confirm memoize, Union and WebBaseLoader are in scope here
+def get_url_title(url: str) -> Union[str, type(None)]:
+    """Last-resort title lookup: load `url` with WebBaseLoader and return the
+    first document's non-empty "title" metadata, or None if there is none."""
+    loader = WebBaseLoader(url, raise_for_status=True)
+    docs = loader.load()
+    if "title" in docs[0].metadata and docs[0].metadata["title"]:
+        return docs[0].metadata["title"]
+    else:
+        return None
diff --git a/wdoc/utils/loaders/url.py b/wdoc/utils/loaders/url.py