@@ -86,7 +86,12 @@ def get_mediawiki_pages_list(
     return pages


-def get_mediawiki_parsed_pages(mediawiki_url: str, pages: list[dict], user_agent: str) -> list[dict]:
+def get_mediawiki_parsed_pages(
+    mediawiki_url: str,
+    pages: list[dict],
+    user_agent: str,
+    exclusions: dict[str, list[str]]
+) -> list[dict]:
     """Parse the pages and split them into sections.

     :param mediawiki_url: The url of the mediawiki site.
@@ -99,7 +104,9 @@ def get_mediawiki_parsed_pages(mediawiki_url: str, pages: list[dict], user_agent
         time.sleep(random.uniform(2, 3))  # We aren't in a hurry (it's only a few requests).
         try:
             sections, categories, internal_links, external_links, language_links = parse_page(
-                mediawiki_url, page["pageid"], user_agent)  # Parse pages and sections.
+                mediawiki_url, page["pageid"], user_agent, exclusions)  # Parse pages and sections.
+            if not sections:  # Something, maybe an exclusion, caused this page to be skipped.
+                continue
             tidy_sections_text(mediawiki_url, sections, categories, internal_links, external_links,
                                language_links)  # Tidy up contents and links.
             calculate_relationships(sections)  # Calculate all the relationships between sections.
@@ -125,7 +132,7 @@ def get_mediawiki_parsed_pages(mediawiki_url: str, pages: list[dict], user_agent
     return parsed_pages


-def parse_page(mediawiki_url: str, page_id: int, user_agent: str) -> list:
+def parse_page(mediawiki_url: str, page_id: int, user_agent: str, exclusions: dict[str, list[str]]) -> list:
     """Fetch a page using mediawiki api and process it completely."""
     api_url = f"{mediawiki_url}/api.php"
     headers = {
@@ -148,6 +155,23 @@ def parse_page(mediawiki_url: str, page_id: int, user_agent: str) -> list:
     internal_links = [link["*"] for link in result.json()["parse"]["links"] if "exists" in link]
     external_links = result.json()["parse"]["externallinks"]
     language_links = [f"{lang["lang"]}:{lang["*"]}" for lang in result.json()["parse"]["langlinks"]]
+    wikitext = result.json()["parse"]["wikitext"]["*"]
+
+    # Apply exclusions.
+    for exclusion in exclusions:  # This is a dict with the type and the values to exclude.
+        logger.debug(f"Applying exclusion {exclusion} = {exclusions[exclusion]} to page {title}.")
+        if exclusion == "categories":
+            # If any of the categories is in the exclusion list, we skip the page.
+            if any(cat.replace(" ", "_") in categories for cat in exclusions[exclusion]):
+                logger.info(f"Excluding page {title} due to category exclusion.")
+                return [[], [], [], [], []]
+        elif exclusion == "wikitext":
+            # If the wikitext contains any of the exclusion regexes, we skip the page.
+            if any(re.search(f"{text}", wikitext) for text in exclusions[exclusion]):
+                logger.info(f"Excluding page {title} due to wikitext regex exclusion.")
+                return [[], [], [], [], []]
+        else:
+            logger.error(f"Unknown exclusion type {exclusion}")

     # Based on the URL and the page id, create a stable document identifier for the whole page.
     doc_id = uuid.uuid5(uuid.NAMESPACE_URL, f"{mediawiki_url}/{id}")
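For reference, a minimal usage sketch of the new exclusions argument. The key names "categories" and "wikitext" are the two handled in the diff above; the concrete values, the wiki URL, the user agent string, and the literal pages list are made-up placeholders, and the call assumes the patched functions are in scope:

# Hypothetical exclusions mapping (values are placeholders): category names are
# compared after replacing spaces with underscores, wikitext entries are treated
# as regular expressions matched against the raw page wikitext.
exclusions = {
    "categories": ["Disambiguation pages", "Drafts"],
    "wikitext": [r"\{\{[Nn]oindex\}\}"],
}

pages = [{"pageid": 123}]  # normally produced by get_mediawiki_pages_list
parsed = get_mediawiki_parsed_pages(
    "https://wiki.example.org/w",            # placeholder MediaWiki base url
    pages,
    "example-bot/0.1 (admin@example.org)",   # placeholder user agent
    exclusions,
)

Any key other than "categories" or "wikitext" only produces an error log entry and is otherwise ignored, so a misspelled exclusion type does not stop the crawl.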