Skip to content

Commit 40ce6d1

Browse files
committed
Find and remove broken canonicals.
1 parent 83ea484 commit 40ce6d1

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

build_docs.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,28 @@ def dev_symlink(www_root: Path, group):
959959
for language in LANGUAGES:
960960
symlink(www_root, language, current_dev, "dev", group)
961961

962+
def proofread_canonicals(www_root: Path) -> None:
    """Remove canonical links whose target is missing from www_root.

    Every ``*.html`` file below *www_root* is searched for a
    ``<link rel="canonical" href="https://docs.python.org/..." />`` element.
    The part of the URL after the host is resolved relative to *www_root*;
    when nothing exists at that path, the element is stripped from the file
    and the file is rewritten in place.

    It can happen that a canonical is "broken":

    - /3.11/whatsnew/3.11.html typically would link to
      /3/whatsnew/3.11.html, which may not exist yet.

    Args:
        www_root: Directory whose layout mirrors the URL paths served
            under https://docs.python.org/.
    """
    # Raw string with escaped dots so that only the literal host matches.
    canonical_re = re.compile(
        r'<link rel="canonical" href="https://docs\.python\.org/([^"]*)" />'
    )
    for file in www_root.glob("**/*.html"):
        # surrogateescape lets files containing bytes that are not valid
        # UTF-8 be read and written back unchanged, instead of aborting the
        # whole pass with UnicodeDecodeError on the first such file.
        html = file.read_text(encoding="UTF-8", errors="surrogateescape")
        canonical = canonical_re.search(html)
        if canonical is None:
            continue
        target = canonical.group(1)
        if (www_root / target).exists():
            continue
        logging.info("Removing broken canonical from %s to %s", file, target)
        html = html.replace(canonical.group(0), "")
        file.write_text(html, encoding="UTF-8", errors="surrogateescape")
983+
962984

963985
def main():
964986
"""Script entry point."""
@@ -991,6 +1013,7 @@ def main():
9911013
build_robots_txt(args.www_root, args.group, args.skip_cache_invalidation)
9921014
major_symlinks(args.www_root, args.group)
9931015
dev_symlink(args.www_root, args.group)
1016+
proofread_canonicals(args.www_root)
9941017

9951018

9961019
if __name__ == "__main__":

0 commit comments

Comments
 (0)