@@ -86,7 +86,12 @@ def get_mediawiki_pages_list(
     return pages


-def get_mediawiki_parsed_pages(mediawiki_url: str, pages: list[dict], user_agent: str) -> list[dict]:
+def get_mediawiki_parsed_pages(
+    mediawiki_url: str,
+    pages: list[dict],
+    user_agent: str,
+    exclusions: dict[str, list[str]]
+) -> list[dict]:
     """Parse the pages and split them into sections.

     :param mediawiki_url: The url of the mediawiki site.
@@ -99,7 +104,9 @@ def get_mediawiki_parsed_pages(mediawiki_url: str, pages: list[dict], user_agent
         time.sleep(random.uniform(2, 3))  # We aren't in a hurry (it's only a few requests).
         try:
             sections, categories, internal_links, external_links, language_links = parse_page(
-                mediawiki_url, page["pageid"], user_agent)  # Parse pages and sections.
+                mediawiki_url, page["pageid"], user_agent, exclusions)  # Parse pages and sections.
+            if not sections:  # Something, maybe an exclusion, caused this page to be skipped.
+                continue
             tidy_sections_text(mediawiki_url, sections, categories, internal_links, external_links,
                                language_links)  # Tidy up contents and links.
             calculate_relationships(sections)  # Calculate all the relationships between sections.
@@ -125,7 +132,7 @@ def get_mediawiki_parsed_pages(mediawiki_url: str, pages: list[dict], user_agent
     return parsed_pages


-def parse_page(mediawiki_url: str, page_id: int, user_agent: str) -> list:
+def parse_page(mediawiki_url: str, page_id: int, user_agent: str, exclusions: dict[str, list[str]]) -> list:
     """Fetch a page using mediawiki api and process it completely."""
     api_url = f"{mediawiki_url}/api.php"
     headers = {
@@ -148,6 +155,23 @@ def parse_page(mediawiki_url: str, page_id: int, user_agent: str) -> list:
     internal_links = [link["*"] for link in result.json()["parse"]["links"] if "exists" in link]
     external_links = result.json()["parse"]["externallinks"]
     language_links = [f"{lang["lang"]}:{lang["*"]}" for lang in result.json()["parse"]["langlinks"]]
+    wikitext = result.json()["parse"]["wikitext"]["*"]
+
+    # Apply exclusions.
+    for exclusion in exclusions:  # This is a dict with the type and the values to exclude.
+        logger.debug(f"Applying exclusion {exclusion} = {exclusions[exclusion]} to page {title}.")
+        if exclusion == "categories":
+            # If any of the categories is in the exclusion list, we skip the page.
+            if any(cat.replace(" ", "_") in categories for cat in exclusions[exclusion]):
+                logger.info(f"Excluding page {title} due to category exclusion.")
+                return [[], [], [], [], []]
+        elif exclusion == "wikitext":
+            # If the wikitext contains any of the exclusion regexes, we skip the page.
+            if any(re.search(f"{text}", wikitext) for text in exclusions[exclusion]):
+                logger.info(f"Excluding page {title} due to wikitext regex exclusion.")
+                return [[], [], [], [], []]
+        else:
+            logger.error(f"Unknown exclusion type {exclusion}")

     # Based on the URL and the page id, create a stable document identifier for the whole page.
     doc_id = uuid.uuid5(uuid.NAMESPACE_URL, f"{mediawiki_url}/{id}")
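For reference, a minimal usage sketch of the new exclusions argument. The key names "categories" and "wikitext" are the two handled in the diff above; the concrete values, the wiki URL, the user agent string, and the literal pages list are made-up placeholders, and the call assumes the patched functions are in scope:

# Hypothetical exclusions mapping (values are placeholders): category names are
# compared after replacing spaces with underscores, wikitext entries are treated
# as regular expressions matched against the raw page wikitext.
exclusions = {
    "categories": ["Disambiguation pages", "Drafts"],
    "wikitext": [r"\{\{[Nn]oindex\}\}"],
}

pages = [{"pageid": 123}]  # normally produced by get_mediawiki_pages_list
parsed = get_mediawiki_parsed_pages(
    "https://wiki.example.org/w",            # placeholder MediaWiki base url
    pages,
    "example-bot/0.1 (admin@example.org)",   # placeholder user agent
    exclusions,
)

Any key other than "categories" or "wikitext" only produces an error log entry and is otherwise ignored, so a misspelled exclusion type does not stop the crawl.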