@@ -22,22 +22,39 @@ def validate
2222 end
2323
2424 def run_internal_link_checker ( links )
25+ # collect urls and metadata for hashes to be checked in the same target file
26+ file_paths_hashes_to_check = { }
2527 to_add = [ ]
26- links . each_pair do |link , matched_files |
28+ links . each_with_index do |( link , matched_files ) , i |
29+ matched_count_to_log = pluralize ( matched_files . count , "reference" , "references" )
30+ @logger . log ( :debug , "(#{ i + 1 } / #{ links . count } ) Internal link #{ link } : Checking #{ matched_count_to_log } " )
2731 matched_files . each do |metadata |
2832 url = HTMLProofer ::Attribute ::Url . new ( @runner , link , base_url : metadata [ :base_url ] )
2933
3034 @runner . current_source = metadata [ :source ]
3135 @runner . current_filename = metadata [ :filename ]
3236
33- unless file_exists? ( url )
37+ target_file_path = url . absolute_path
38+ unless file_exists? ( target_file_path )
3439 @failed_checks << Failure . new ( @runner . current_filename , "Links > Internal" ,
3540 "internally linking to #{ url } , which does not exist" , line : metadata [ :line ] , status : nil , content : nil )
3641 to_add << [ url , metadata , false ]
3742 next
3843 end
3944
40- unless hash_exists? ( url )
45+ hash_exists = hash_exists_for_url? ( url )
46+ if hash_exists . nil?
47+ # the hash needs to be checked in the target file, we collect the url and metadata
48+ unless file_paths_hashes_to_check . key? ( target_file_path )
49+ file_paths_hashes_to_check [ target_file_path ] = { }
50+ end
51+ unless file_paths_hashes_to_check [ target_file_path ] . key? ( url . hash )
52+ file_paths_hashes_to_check [ target_file_path ] [ url . hash ] = [ ]
53+ end
54+ file_paths_hashes_to_check [ target_file_path ] [ url . hash ] << [ url , metadata ]
55+ next
56+ end
57+ unless hash_exists
4158 @failed_checks << Failure . new ( @runner . current_filename , "Links > Internal" ,
4259 "internally linking to #{ url } ; the file exists, but the hash '#{ url . hash } ' does not" , line : metadata [ :line ] , status : nil , content : nil )
4360 to_add << [ url , metadata , false ]
@@ -48,6 +65,24 @@ def run_internal_link_checker(links)
4865 end
4966 end
5067
68+ # check hashes by target file
69+ @logger . log ( :info , "Checking internal link hashes in #{ pluralize ( file_paths_hashes_to_check . count , "file" , "files" ) } " )
70+ file_paths_hashes_to_check . each_with_index do |( file_path , hashes_to_check ) , i |
71+ hash_count_to_log = pluralize ( hashes_to_check . count , "hash" , "hashes" )
72+ @logger . log ( :debug , "(#{ i + 1 } / #{ file_paths_hashes_to_check . count } ) Checking #{ hash_count_to_log } in #{ file_path } " )
73+ html = create_nokogiri ( file_path )
74+ hashes_to_check . each_pair do |href_hash , url_metadata |
75+ exists = hash_exists_in_html? ( href_hash , html )
76+ url_metadata . each do |( url , metadata ) |
77+ unless exists
78+ @failed_checks << Failure . new ( metadata [ :filename ] , "Links > Internal" ,
79+ "internally linking to #{ url } ; the file exists, but the hash '#{ href_hash } ' does not" , line : metadata [ :line ] , status : nil , content : nil )
80+ end
81+ to_add << [ url , metadata , exists ]
82+ end
83+ end
84+ end
85+
5186 # adding directly to the cache above results in an endless loop
5287 to_add . each do |( url , metadata , exists ) |
5388 @cache . add_internal ( url . to_s , metadata , exists )
@@ -56,15 +91,15 @@ def run_internal_link_checker(links)
5691 @failed_checks
5792 end
5893
59- private def file_exists? ( url )
60- absolute_path = url . absolute_path
61- return @runner . checked_paths [ url . absolute_path ] if @runner . checked_paths . key? ( absolute_path )
94+ private def file_exists? ( absolute_path )
95+ return @runner . checked_paths [ absolute_path ] if @runner . checked_paths . key? ( absolute_path )
6296
63- @runner . checked_paths [ url . absolute_path ] = File . exist? ( absolute_path )
97+ @runner . checked_paths [ absolute_path ] = File . exist? ( absolute_path )
6498 end
6599
66- # verify the target hash
67- private def hash_exists? ( url )
100+ # verify the hash w/o just based on the URL, w/o looking at the target file
101+ # => returns nil if the has could not be verified
102+ private def hash_exists_for_url? ( url )
68103 href_hash = url . hash
69104 return true if blank? ( href_hash )
70105 return true unless @runner . options [ :check_internal_hash ]
@@ -76,10 +111,18 @@ def run_internal_link_checker(links)
76111 decoded_href_hash = Addressable ::URI . unescape ( href_hash )
77112 fragment_ids = [ href_hash , decoded_href_hash ]
78113 # https://www.w3.org/TR/html5/single-page.html#scroll-to-fragid
79- fragment_ids . include? ( "top" ) || !find_fragments ( fragment_ids , url ) . empty?
114+ return true if fragment_ids . include? ( "top" )
115+
116+ nil
117+ end
118+
119+ private def hash_exists_in_html? ( href_hash , html )
120+ decoded_href_hash = Addressable ::URI . unescape ( href_hash )
121+ fragment_ids = [ href_hash , decoded_href_hash ]
122+ !find_fragments ( fragment_ids , html ) . empty?
80123 end
81124
82- private def find_fragments ( fragment_ids , url )
125+ private def find_fragments ( fragment_ids , html )
83126 xpaths = fragment_ids . uniq . flat_map do |frag_id |
84127 escaped_frag_id = "'#{ frag_id . split ( "'" ) . join ( "', \" '\" , '" ) } ', ''"
85128 [
@@ -89,7 +132,6 @@ def run_internal_link_checker(links)
89132 end
90133 xpaths << XpathFunctions . new
91134
92- html = create_nokogiri ( url . absolute_path )
93135 html . xpath ( *xpaths )
94136 end
95137 end
0 commit comments