@@ -959,6 +959,28 @@ def dev_symlink(www_root: Path, group):
959
959
for language in LANGUAGES :
960
960
symlink (www_root , language , current_dev , "dev" , group )
961
961
962
def proofread_canonicals(www_root: Path) -> None:
    """Remove canonical links that point to content missing from www_root.

    It can happen that a canonical is "broken":

    - /3.11/whatsnew/3.11.html typically would link to
      /3/whatsnew/3.11.html, which may not exist yet.

    Every ``*.html`` file found recursively under *www_root* is searched for
    its first ``<link rel="canonical" ...>`` tag whose href starts with
    ``https://docs.python.org/``. When the path following that prefix does
    not exist relative to *www_root*, the tag is stripped from the page and
    the file is rewritten in place. Files without such a tag, or whose
    target exists, are not written to.

    Args:
        www_root: Root directory that canonical paths are resolved against.
    """
    # Raw string with escaped dots so that only the literal host name matches.
    canonical_re = re.compile(
        r'<link rel="canonical" href="https://docs\.python\.org/([^"]*)" />'
    )
    for file in www_root.glob("**/*.html"):
        # "surrogateescape" keeps undecodable bytes as lone surrogates instead
        # of raising UnicodeDecodeError, so one badly encoded page cannot
        # abort the whole pass.
        html = file.read_text(encoding="UTF-8", errors="surrogateescape")
        canonical = canonical_re.search(html)
        if not canonical:
            continue
        target = canonical.group(1)
        if (www_root / target).exists():
            continue
        logging.info("Removing broken canonical from %s to %s", file, target)
        html = html.replace(canonical.group(0), "")
        # Same error handler on the way out: escaped surrogates are turned
        # back into the original bytes.
        file.write_text(html, encoding="UTF-8", errors="surrogateescape")
983
+
962
984
963
985
def main ():
964
986
"""Script entry point."""
@@ -991,6 +1013,7 @@ def main():
991
1013
build_robots_txt (args .www_root , args .group , args .skip_cache_invalidation )
992
1014
major_symlinks (args .www_root , args .group )
993
1015
dev_symlink (args .www_root , args .group )
1016
+ proofread_canonicals (args .www_root )
994
1017
995
1018
996
1019
if __name__ == "__main__" :
0 commit comments