|  | 
| 11 | 11 | import sys | 
| 12 | 12 | import time | 
| 13 | 13 | import traceback | 
| 14 |  | -from functools import cache as memoize | 
| 15 | 14 | from functools import wraps | 
| 16 | 15 | from pathlib import Path | 
| 17 | 16 | from loguru import logger | 
|  | 
| 23 | 22 | import dill | 
| 24 | 23 | import ffmpeg | 
| 25 | 24 | import ftfy | 
| 26 |  | -import goose3 | 
| 27 | 25 | import httpx | 
| 28 | 26 | import litellm | 
| 29 | 27 | import LogseqMarkdownParser | 
| 30 |  | -import playwright.sync_api | 
| 31 | 28 | import pydub | 
| 32 | 29 | import requests | 
| 33 | 30 | import uuid6 | 
|  | 
| 37 | 34 | from langchain.text_splitter import TextSplitter | 
| 38 | 35 | from langchain_community.document_loaders import ( | 
| 39 | 36 |     Docx2txtLoader, | 
| 40 |  | -    PlaywrightURLLoader, | 
| 41 |  | -    SeleniumURLLoader, | 
| 42 | 37 |     UnstructuredEPubLoader, | 
| 43 | 38 |     UnstructuredPowerPointLoader, | 
| 44 |  | -    UnstructuredURLLoader, | 
| 45 | 39 |     UnstructuredWordDocumentLoader, | 
| 46 |  | -    WebBaseLoader, | 
| 47 | 40 | ) | 
| 48 | 41 | from prompt_toolkit import prompt | 
| 49 | 42 | 
 | 
|  | 
| 67 | 60 |     wpm, | 
| 68 | 61 | ) | 
| 69 | 62 | from wdoc.utils.errors import TimeoutPdfLoaderError | 
| 70 |  | -from .shared import debug_return_empty | 
|  | 63 | +from .shared import debug_return_empty, markdownimage_regex | 
| 71 | 64 | 
 | 
| 72 | 65 | try: | 
| 73 | 66 |     import torchaudio | 
|  | 
| 108 | 101 | 
 | 
# Module-level regexes used to clean up markdown emitted by web loaders.
markdownlink_regex = re.compile(r"\[.*?\]\((.*?)\)")  # to find markdown links
# to replace markdown links by their text (group 1 is the link label)
markdownlinkparser_regex = re.compile(r"\[([^\]]+)\]\(http[s]?://[^)]+\)")
# to remove image from jina reader that take a lot of tokens but are not yet used
# group 1 is the alt text; group 2 matches either an inline (...) target or a
# reference-style [...] target
markdownimage_regex = re.compile(
    r"!\[([^\]]*)\]\s*(\([^\)]+\)|\[[^\]]+\])", flags=re.MULTILINE
)
| 117 | 105 | 
 | 
def md_shorten_image_name(md_image: re.Match) -> str:
    """Reduce a matched markdown image to its alt text, abbreviated if long.

    Alt texts longer than 16 characters are collapsed to their first and
    last 8 characters joined by an ellipsis.
    """
    label = md_image.group(1)
    if len(label) > 16:
        return f"{label[:8]}…{label[-8:]}"
    return label
| 125 | 106 | 
 | 
| 126 | 107 | 
 | 
| 127 | 108 | # to check that a youtube link is valid | 
| @@ -529,21 +510,6 @@ def format_args_with_types(arg_names: List[str]) -> str: | 
| 529 | 510 |     return docs | 
| 530 | 511 | 
 | 
| 531 | 512 | 
 | 
| 532 |  | -# Convenience functions ######################### | 
| 533 |  | - | 
| 534 |  | - | 
@memoize
def get_url_title(url: str) -> Union[str, None]:
    """Last-resort lookup of a page title when no loader supplied one.

    Fetches *url* with ``WebBaseLoader`` (raising on HTTP error status) and
    returns the ``title`` metadata of the first loaded document, or ``None``
    when the title is missing or falsy.  Results are memoized per url for
    the lifetime of the process.

    Args:
        url: the address of the page whose title is wanted.

    Returns:
        The page title, or ``None`` if none could be extracted.
    """
    loader = WebBaseLoader(url, raise_for_status=True)
    docs = loader.load()
    # single .get() instead of an `in` test followed by a subscript;
    # falsy titles (e.g. "") are normalized to None like before
    title = docs[0].metadata.get("title")
    return title if title else None
| 545 |  | - | 
| 546 |  | - | 
| 547 | 513 | # loaders ####################################### | 
| 548 | 514 | 
 | 
| 549 | 515 | 
 | 
| @@ -1630,156 +1596,6 @@ def load_json_dict( | 
| 1630 | 1596 |     return docs | 
| 1631 | 1597 | 
 | 
| 1632 | 1598 | 
 | 
@debug_return_empty
@optional_strip_unexp_args
@doc_loaders_cache.cache
def load_url(path: str, title=None) -> List[Document]:
    """Load a web page into Documents by trying a cascade of parsers.

    Parsers are attempted in order — jina.ai reader, Playwright, Selenium
    (firefox, then chrome), goose3, UnstructuredURLLoader, and finally
    WebBaseLoader — and the first one whose output passes
    ``check_docs_tkn_length`` wins.  Each failure is logged as a warning
    and the next parser is tried.

    Args:
        path: the url to load.
        title: optional title already known to the caller; the literal
            "Untitled" is treated as unknown.  Whichever parser first
            yields a usable title fills it in.

    Returns:
        The documents produced by the first successful parser, with a
        ``title`` metadata entry set or prefixed when one was found.

    NOTE(review): if every parser fails, ``docs`` is never bound and the
    final ``return docs`` raises NameError rather than a descriptive
    error — confirm this is the intended failure mode.
    """
    logger.info(f"Loading url: '{path}'")

    # even if loading fails the title might be found so trying to keep
    # the first working title across trials
    if title == "Untitled":
        title = None

    loaded_success = False
    if not loaded_success:
        try:
            # jina.ai's reader proxy returns a markdown rendition prefixed
            # with "Title: ..." and a "Markdown Content:" separator
            loader = WebBaseLoader("https://r.jina.ai/" + path, raise_for_status=True)
            text = "\n".join([doc.page_content for doc in loader.load()]).strip()
            assert text, "Empty text"
            if not title:
                if text.splitlines()[0].startswith("Title: "):
                    title = text.splitlines()[0].replace("Title: ", "", 1)
            text = text.split("Markdown Content:", 1)[1]
            text = markdownlinkparser_regex.sub(r"\1", text)  # remove links
            # remove markdown images for now as caption is disabled so it's just base64 or something like that, keep only a shorten image name
            text = markdownimage_regex.sub(md_shorten_image_name, text)
            docs = [
                Document(
                    page_content=text,
                    metadata={
                        "parser": "jinareader",
                    },
                )
            ]
            if title:
                for doc in docs:
                    doc.metadata["title"] = title
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(f"Exception when using jina reader to parse url: '{err}'")

    if not loaded_success:
        try:
            # Playwright: headless-browser rendering, dropping boilerplate
            loader = PlaywrightURLLoader(
                urls=[path], remove_selectors=["header", "footer"]
            )
            docs = loader.load()
            assert docs, "Empty docs when using playwright"
            if not title and "title" in docs[0].metadata:
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(f"Exception when using playwright to parse url: '{err}'")

    if not loaded_success:
        try:
            loader = SeleniumURLLoader(urls=[path], browser="firefox")
            docs = loader.load()
            assert docs, "Empty docs when using selenium firefox"
            # "No title found." is Selenium's placeholder, not a real title
            if (
                not title
                and "title" in docs[0].metadata
                and docs[0].metadata["title"] != "No title found."
            ):
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using selenium firefox to parse url: '{err}'"
            )

    if not loaded_success:
        try:
            loader = SeleniumURLLoader(urls=[path], browser="chrome")
            docs = loader.load()
            assert docs, "Empty docs when using selenium chrome"
            if (
                not title
                and "title" in docs[0].metadata
                and docs[0].metadata["title"] != "No title found."
            ):
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using selenium chrome to parse url: '{err}'\nUsing goose as fallback"
            )

    if not loaded_success:
        try:
            # goose3: article-extraction heuristics, returns plain text
            g = goose3.Goose()
            article = g.extract(url=path)
            text = article.cleaned_text
            docs = [Document(page_content=text)]
            assert docs, "Empty docs when using goose"
            if not title:
                # NOTE(review): docs[0] was just built without metadata, so
                # the first branch can never fire — confirm intent
                if "title" in docs[0].metadata and docs[0].metadata["title"]:
                    title = docs[0].metadata["title"]
                elif article.title:
                    title = article.title
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(f"Exception when using goose to parse url: '{err}'")

    if not loaded_success:
        try:
            loader = UnstructuredURLLoader([path])
            docs = loader.load()
            assert docs, "Empty docs when using UnstructuredURLLoader"
            if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using UnstructuredURLLoader to parse url: '{err}'"
            )

    if not loaded_success:
        try:
            # plain HTTP fetch + BeautifulSoup parse, last parsing attempt
            loader = WebBaseLoader(path, raise_for_status=True)
            docs = loader.load()
            assert docs, "Empty docs when using html"
            if not title and "title" in docs[0].metadata and docs[0].metadata["title"]:
                title = docs[0].metadata["title"]
            check_docs_tkn_length(docs, path)
            loaded_success = True
        except Exception as err:
            logger.warning(
                f"Exception when using html as LAST RESORT to parse url: '{err}'"
            )

    # last resort, try to get the title from the most basic loader
    if not title:
        title = get_url_title(path)

    # store the title as metadata if missing
    if title:
        for d in docs:
            if "title" not in d.metadata or not d.metadata["title"]:
                d.metadata["title"] = title
            else:
                if d.metadata["title"] != title:
                    d.metadata["title"] = f"{title} - {d.metadata['title']}"

    return docs
| 1782 |  | - | 
| 1783 | 1599 | 
 | 
| 1784 | 1600 | @debug_return_empty | 
| 1785 | 1601 | @optional_strip_unexp_args | 
|  | 
0 commit comments