Skip to content

Commit fc5eab8

Browse files
authored
Merge pull request #770 from riccardoporreca/feature/768-optimize-internal-hash-checks
Optimize checking internal link hashes in target files
2 parents 026549f + e156faa commit fc5eab8

File tree

3 files changed

+56
-14
lines changed

3 files changed

+56
-14
lines changed

Rakefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ task :proof_readme do
2424
require "redcarpet"
2525

2626
renderer = Redcarpet::Render::HTML.new(\
27-
with_toc_data: true
27+
with_toc_data: true,
2828
)
2929
redcarpet = Redcarpet::Markdown.new(renderer)
3030
html = redcarpet.render(File.read("README.md"))

lib/html_proofer.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
lib_dir = File.join(File.dirname(__dir__), "lib")
55
gem_loader = Zeitwerk::Loader.for_gem
66
gem_loader.inflector.inflect(
7-
"html_proofer" => "HTMLProofer"
7+
"html_proofer" => "HTMLProofer",
88
)
99
gem_loader.ignore(File.join(lib_dir, "html-proofer.rb"))
1010
gem_loader.setup

lib/html_proofer/url_validator/internal.rb

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,39 @@ def validate
2222
end
2323

2424
def run_internal_link_checker(links)
25+
# collect urls and metadata for hashes to be checked in the same target file
26+
file_paths_hashes_to_check = {}
2527
to_add = []
26-
links.each_pair do |link, matched_files|
28+
links.each_with_index do |(link, matched_files), i|
29+
matched_count_to_log = pluralize(matched_files.count, "reference", "references")
30+
@logger.log(:debug, "(#{i + 1} / #{links.count}) Internal link #{link}: Checking #{matched_count_to_log}")
2731
matched_files.each do |metadata|
2832
url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url])
2933

3034
@runner.current_source = metadata[:source]
3135
@runner.current_filename = metadata[:filename]
3236

33-
unless file_exists?(url)
37+
target_file_path = url.absolute_path
38+
unless file_exists?(target_file_path)
3439
@failed_checks << Failure.new(@runner.current_filename, "Links > Internal",
3540
"internally linking to #{url}, which does not exist", line: metadata[:line], status: nil, content: nil)
3641
to_add << [url, metadata, false]
3742
next
3843
end
3944

40-
unless hash_exists?(url)
45+
hash_exists = hash_exists_for_url?(url)
46+
if hash_exists.nil?
47+
# the hash needs to be checked in the target file, we collect the url and metadata
48+
unless file_paths_hashes_to_check.key?(target_file_path)
49+
file_paths_hashes_to_check[target_file_path] = {}
50+
end
51+
unless file_paths_hashes_to_check[target_file_path].key?(url.hash)
52+
file_paths_hashes_to_check[target_file_path][url.hash] = []
53+
end
54+
file_paths_hashes_to_check[target_file_path][url.hash] << [url, metadata]
55+
next
56+
end
57+
unless hash_exists
4158
@failed_checks << Failure.new(@runner.current_filename, "Links > Internal",
4259
"internally linking to #{url}; the file exists, but the hash '#{url.hash}' does not", line: metadata[:line], status: nil, content: nil)
4360
to_add << [url, metadata, false]
@@ -48,6 +65,24 @@ def run_internal_link_checker(links)
4865
end
4966
end
5067

68+
# check hashes by target file
69+
@logger.log(:info, "Checking internal link hashes in #{pluralize(file_paths_hashes_to_check.count, "file", "files")}")
70+
file_paths_hashes_to_check.each_with_index do |(file_path, hashes_to_check), i|
71+
hash_count_to_log = pluralize(hashes_to_check.count, "hash", "hashes")
72+
@logger.log(:debug, "(#{i + 1} / #{file_paths_hashes_to_check.count}) Checking #{hash_count_to_log} in #{file_path}")
73+
html = create_nokogiri(file_path)
74+
hashes_to_check.each_pair do |href_hash, url_metadata|
75+
exists = hash_exists_in_html?(href_hash, html)
76+
url_metadata.each do |(url, metadata)|
77+
unless exists
78+
@failed_checks << Failure.new(metadata[:filename], "Links > Internal",
79+
"internally linking to #{url}; the file exists, but the hash '#{href_hash}' does not", line: metadata[:line], status: nil, content: nil)
80+
end
81+
to_add << [url, metadata, exists]
82+
end
83+
end
84+
end
85+
5186
# adding directly to the cache above results in an endless loop
5287
to_add.each do |(url, metadata, exists)|
5388
@cache.add_internal(url.to_s, metadata, exists)
@@ -56,15 +91,15 @@ def run_internal_link_checker(links)
5691
@failed_checks
5792
end
5893

59-
private def file_exists?(url)
60-
absolute_path = url.absolute_path
61-
return @runner.checked_paths[url.absolute_path] if @runner.checked_paths.key?(absolute_path)
94+
private def file_exists?(absolute_path)
95+
return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path)
6296

63-
@runner.checked_paths[url.absolute_path] = File.exist?(absolute_path)
97+
@runner.checked_paths[absolute_path] = File.exist?(absolute_path)
6498
end
6599

66-
# verify the target hash
67-
private def hash_exists?(url)
100+
# verify the hash w/o just based on the URL, w/o looking at the target file
101+
# => returns nil if the has could not be verified
102+
private def hash_exists_for_url?(url)
68103
href_hash = url.hash
69104
return true if blank?(href_hash)
70105
return true unless @runner.options[:check_internal_hash]
@@ -76,10 +111,18 @@ def run_internal_link_checker(links)
76111
decoded_href_hash = Addressable::URI.unescape(href_hash)
77112
fragment_ids = [href_hash, decoded_href_hash]
78113
# https://www.w3.org/TR/html5/single-page.html#scroll-to-fragid
79-
fragment_ids.include?("top") || !find_fragments(fragment_ids, url).empty?
114+
return true if fragment_ids.include?("top")
115+
116+
nil
117+
end
118+
119+
private def hash_exists_in_html?(href_hash, html)
120+
decoded_href_hash = Addressable::URI.unescape(href_hash)
121+
fragment_ids = [href_hash, decoded_href_hash]
122+
!find_fragments(fragment_ids, html).empty?
80123
end
81124

82-
private def find_fragments(fragment_ids, url)
125+
private def find_fragments(fragment_ids, html)
83126
xpaths = fragment_ids.uniq.flat_map do |frag_id|
84127
escaped_frag_id = "'#{frag_id.split("'").join("', \"'\", '")}', ''"
85128
[
@@ -89,7 +132,6 @@ def run_internal_link_checker(links)
89132
end
90133
xpaths << XpathFunctions.new
91134

92-
html = create_nokogiri(url.absolute_path)
93135
html.xpath(*xpaths)
94136
end
95137
end

0 commit comments

Comments
 (0)