From fb8eece1ae9a6df62dc5f92e06e16d00a0bffb87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mislav=20Marohni=C4=87?= Date: Mon, 16 Jun 2025 23:19:19 +0200 Subject: [PATCH] Optimize traversing the DOM when analyzing text content Previously, this was an often-repeated construct in readability implementation: charCount(ps.getInnerText(node, true)) == 0 What this would do is: - Call `dom.TextContent(node)` to append the contents of all individual text nodes together; - Pass the result through `strings.TrimSpace`; - Pass the result through the NormalizeSpaces regex which squashes consecutive runs of whitespace; - Count the Unicode runes of the result; - Finally, if the count is zero, the element would be considered "empty". The above is an example of an incredibly costly operation that could be done much more efficiently, for example: walk the DOM subtree until the first non-space character is found, then bail out and conclude that the element has content. This barely needs any memory allocations, and is the approach taken in this PR to address a variety of counting or detecting tasks that share a similar purpose. Benchmark before vs. 
after for processing a large HTML document reveals significant saving in memory allocations: variant | times | ns/op | Bytes/op | allocs/op --------|-------|------------|------------|---------- before | 30 | 38,986,203 | 59,623,683 | 199,876 after | 36 | 31,910,769 | 11,449,004 | 119,810 --- internal/re2go/grab-article.go | 137 +--------------- internal/re2go/grab-article.re | 22 --- internal/re2go/re2go_test.go | 5 - parser.go | 292 ++++++++++++++++++++------------- parser_test.go | 54 ++++++ traverse.go | 93 +++++++++++ utils.go | 35 +++- 7 files changed, 353 insertions(+), 285 deletions(-) create mode 100644 traverse.go diff --git a/internal/re2go/grab-article.go b/internal/re2go/grab-article.go index 52924f9..8b1db0c 100644 --- a/internal/re2go/grab-article.go +++ b/internal/re2go/grab-article.go @@ -1,4 +1,4 @@ -// Code generated by re2go 4.0.2, DO NOT EDIT. +// Code generated by re2go 4.2, DO NOT EDIT. package re2go // Original pattern: (?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote @@ -2024,138 +2024,3 @@ func MaybeItsACandidate(input string) bool { } } - -// Commas as used in Latin, Sindhi, Chinese and various other scripts. 
-// see: https://en.wikipedia.org/wiki/Comma#Comma_variants -// Original pattern: \u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C -func CountCommas(input string) int { - var count int - var cursor, marker int - input += string(rune(0)) // add terminating null - limit := len(input) - 1 // limit points at the terminating null - _ = marker - - for { - { - var yych byte - yych = input[cursor] - switch yych { - case ',': - goto yy177 - case 0xD8: - goto yy178 - case 0xE2: - goto yy179 - case 0xEF: - goto yy180 - default: - if limit <= cursor { - goto yy187 - } - goto yy175 - } - yy175: - cursor++ - yy176: - { - continue - } - yy177: - cursor++ - { - count++ - continue - } - yy178: - cursor++ - yych = input[cursor] - switch yych { - case 0x8C: - goto yy177 - default: - goto yy176 - } - yy179: - cursor++ - marker = cursor - yych = input[cursor] - switch yych { - case 0xB8: - goto yy181 - case 0xB9: - goto yy183 - default: - goto yy176 - } - yy180: - cursor++ - marker = cursor - yych = input[cursor] - switch yych { - case 0xB8: - goto yy184 - case 0xB9: - goto yy185 - case 0xBC: - goto yy186 - default: - goto yy176 - } - yy181: - cursor++ - yych = input[cursor] - switch yych { - case 0xB2: - fallthrough - case 0xB4: - goto yy177 - default: - goto yy182 - } - yy182: - cursor = marker - goto yy176 - yy183: - cursor++ - yych = input[cursor] - switch yych { - case 0x81: - goto yy177 - default: - goto yy182 - } - yy184: - cursor++ - yych = input[cursor] - switch yych { - case 0x90, 0x91: - goto yy177 - default: - goto yy182 - } - yy185: - cursor++ - yych = input[cursor] - switch yych { - case 0x90: - goto yy177 - default: - goto yy182 - } - yy186: - cursor++ - yych = input[cursor] - switch yych { - case 0x8C: - goto yy177 - default: - goto yy182 - } - yy187: - { - return count - } - } - - } -} diff --git a/internal/re2go/grab-article.re b/internal/re2go/grab-article.re index c9f861f..81e0cd8 100644 --- a/internal/re2go/grab-article.re +++ 
b/internal/re2go/grab-article.re @@ -37,25 +37,3 @@ func MaybeItsACandidate(input string) bool { */ } } - -// Commas as used in Latin, Sindhi, Chinese and various other scripts. -// see: https://en.wikipedia.org/wiki/Comma#Comma_variants -// Original pattern: \u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C -func CountCommas(input string) int { - var count int - var cursor, marker int - input += string(rune(0)) // add terminating null - limit := len(input) - 1 // limit points at the terminating null - _ = marker - - for { /*!use:re2c:base_template - re2c:case-insensitive = 1; - - commas = [\u002C\u060C\uFE50\uFE10\uFE11\u2E41\u2E34\u2E32\uFF0C]; - - {commas} { count++; continue } - * { continue } - $ { return count } - */ - } -} \ No newline at end of file diff --git a/internal/re2go/re2go_test.go b/internal/re2go/re2go_test.go index 1ebe32e..99abb5d 100644 --- a/internal/re2go/re2go_test.go +++ b/internal/re2go/re2go_test.go @@ -152,11 +152,6 @@ func Test_MaybeItsACandidate(t *testing.T) { assert.False(t, MaybeItsACandidate(`

Paragraph text

`)) } -func Test_CountCommas(t *testing.T) { - assert.Equal(t, 3, CountCommas("my,name,is,john")) - assert.Equal(t, 9, CountCommas("now,its،a mixed﹐commas︐from︑various⹁place⸴and⸲country,")) -} - func Test_NormalizeSpaces(t *testing.T) { assert.Equal(t, "some sentence", NormalizeSpaces("some sentence")) assert.Equal(t, "with tabs", NormalizeSpaces("with \t \ttabs")) diff --git a/parser.go b/parser.go index 6e9ad45..8c704bb 100644 --- a/parser.go +++ b/parser.go @@ -23,9 +23,7 @@ import ( var ( rxVideos = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`) rxTokenize = regexp.MustCompile(`(?i)\W+`) - rxWhitespace = regexp.MustCompile(`(?i)^\s*$`) rxHasContent = regexp.MustCompile(`(?i)\S$`) - rxHashURL = regexp.MustCompile(`(?i)^#.+`) rxPropertyPattern = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|article|twitter)\s*:\s*(author|creator|description|title|site_name|published_time|modified_time|image\S*)\s*`) rxNamePattern = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|article|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|published_time|modified_time|image)\s*$`) rxTitleSeparator = regexp.MustCompile(`(?i) [\|\-\\/>»] `) @@ -405,8 +403,7 @@ func (ps *Parser) getArticleTitle() string { } } - curTitle = strings.TrimSpace(curTitle) - curTitle = re2go.NormalizeSpaces(curTitle) + curTitle = normalizeWhitespace(curTitle) // If we now have 4 words or fewer as our title, and either no // 'hierarchical' separators (\, /, > or ») were found in the original // title or we decreased the number of words by more than 1 word, use @@ -448,7 +445,7 @@ func (ps *Parser) prepDocument() { // same node is returned. 
func (ps *Parser) nextNode(node *html.Node) *html.Node { next := node - for next != nil && next.Type != html.ElementNode && rxWhitespace.MatchString(dom.TextContent(next)) { + for next != nil && next.Type != html.ElementNode && !hasTextContent(next) { next = next.NextSibling } return next @@ -589,7 +586,7 @@ func (ps *Parser) prepArticle(articleContent *html.Node) { iframeCount := len(dom.GetElementsByTagName(p, "iframe")) totalCount := imgCount + embedCount + objectCount + iframeCount - return totalCount == 0 && ps.getInnerText(p, false) == "" + return totalCount == 0 && !hasTextContent(p) }) ps.forEachNode(dom.GetElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { @@ -714,31 +711,19 @@ func (ps *Parser) checkByline(node *html.Node, matchString string) bool { rel := dom.GetAttribute(node, "rel") itemprop := dom.GetAttribute(node, "itemprop") - nodeText := dom.TextContent(node) - if (rel == "author" || strings.Contains(itemprop, "author") || re2go.IsByline(matchString)) && - ps.isValidByline(nodeText) { - nodeText = strings.TrimSpace(nodeText) - nodeText = strings.Join(strings.Fields(nodeText), " ") - ps.articleByline = nodeText - return true + if rel != "author" && !strings.Contains(itemprop, "author") && !re2go.IsByline(matchString) { + return false } - return false -} - -func (ps *Parser) getTextDensity(node *html.Node, tags ...string) float64 { - textLength := charCount(ps.getInnerText(node, true)) - if textLength == 0 { - return 0 + nodeText := ps.getInnerText(node, false) + // For now, it's intentional that counting characters happens before + // whitespace normalization. Doing it the other way around breaks several + // tests and the bylines end up different. + if nChar := charCount(nodeText); nChar > 0 && nChar < 100 { + ps.articleByline = normalizeWhitespace(nodeText) + return true } - - var childrenLength int - children := ps.getAllNodesWithTag(node, tags...) 
- ps.forEachNode(children, func(child *html.Node, _ int) { - childrenLength += charCount(ps.getInnerText(child, true)) - }) - - return float64(childrenLength) / float64(textLength) + return false } // getNodeAncestors gets the node's direct parent and grandparents. @@ -816,7 +801,7 @@ func (ps *Parser) grabArticle() *html.Node { if shouldRemoveTitleHeader && ps.headerDuplicatesTitle(node) { ps.logf("removing header: %q duplicate of %q\n", - trim(dom.TextContent(node)), trim(ps.articleTitle)) + ps.getInnerText(node, true), normalizeWhitespace(ps.articleTitle)) shouldRemoveTitleHeader = false node = ps.removeAndGetNext(node) continue @@ -911,9 +896,9 @@ func (ps *Parser) grabArticle() *html.Node { return } + numChars, numCommas := countCharsAndCommas(elementToScore) // If this paragraph is less than 25 characters, don't even count it. - innerText := ps.getInnerText(elementToScore, true) - if charCount(innerText) < 25 { + if numChars < 25 { return } @@ -927,10 +912,10 @@ func (ps *Parser) grabArticle() *html.Node { contentScore := 1 // Add points for any commas within this paragraph. - contentScore += re2go.CountCommas(innerText) + contentScore += numCommas // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0)) + contentScore += int(math.Min(math.Floor(float64(numChars)/100.0), 3.0)) // Initialize and score ancestors. 
ps.forEachNode(ancestors, func(ancestor *html.Node, level int) { @@ -1130,6 +1115,7 @@ func (ps *Parser) grabArticle() *html.Node { appendNode = true } else if dom.TagName(sibling) == "p" { linkDensity := ps.getLinkDensity(sibling) + // FIXME: avoid gathering nodeContent just to detect whether there was a sentence period nodeContent := ps.getInnerText(sibling, true) nodeLength := charCount(nodeContent) @@ -1199,7 +1185,7 @@ func (ps *Parser) grabArticle() *html.Node { // gives us a higher likelihood of finding the content, and // the sieve approach gives us a higher likelihood of // finding the -right- content. - textLength := charCount(ps.getInnerText(articleContent, true)) + textLength, _ := countCharsAndCommas(articleContent) if textLength < ps.CharThresholds { parseSuccessful = false @@ -1249,15 +1235,6 @@ func (ps *Parser) grabArticle() *html.Node { } } -// isValidByline checks whether the input string could be a byline. -// This verifies that the input is a string, and that the length -// is less than 100 chars. -func (ps *Parser) isValidByline(byline string) bool { - byline = strings.TrimSpace(byline) - nChar := charCount(byline) - return nChar > 0 && nChar < 100 -} - // getJSONLD try to extract metadata from JSON-LD object. // For now, only Schema.org objects of type Article or its subtypes are supported. func (ps *Parser) getJSONLD() (map[string]string, error) { @@ -1515,8 +1492,7 @@ func (ps *Parser) isSingleImage(node *html.Node) bool { } children := dom.Children(node) - textContent := dom.TextContent(node) - if len(children) != 1 || strings.TrimSpace(textContent) != "" { + if len(children) != 1 || hasTextContent(node) { return false } @@ -1624,15 +1600,23 @@ func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool } // isElementWithoutContent determines if node is empty -// or only fille with
<br> and <hr>
. +// or only filled with
<br> and <hr>
. func (ps *Parser) isElementWithoutContent(node *html.Node) bool { - brs := dom.GetElementsByTagName(node, "br") - hrs := dom.GetElementsByTagName(node, "hr") - childs := dom.Children(node) - - return node.Type == html.ElementNode && - strings.TrimSpace(dom.TextContent(node)) == "" && - (len(childs) == 0 || len(childs) == len(brs)+len(hrs)) + if node.Type != html.ElementNode { + return false + } + // Traverse the node's descendants to find any text content that is + // non-whitespace or any elements other than
<br> and <hr>
. + for child := range node.ChildNodes() { + if child.Type == html.TextNode { + if hasContent(child.Data) { + return false + } + } else if child.Type == html.ElementNode && child.Data != "br" && child.Data != "hr" { + return false + } + } + return true } // hasChildBlockElement determines whether element has any children @@ -1654,26 +1638,18 @@ func (ps *Parser) isPhrasingContent(node *html.Node) bool { // isWhitespace determines if a node only used as whitespace. func (ps *Parser) isWhitespace(node *html.Node) bool { - return (node.Type == html.TextNode && strings.TrimSpace(dom.TextContent(node)) == "") || + return (node.Type == html.TextNode && !hasTextContent(node)) || (node.Type == html.ElementNode && dom.TagName(node) == "br") } -// getInnerText gets the inner text of a node. -// This also strips * out any excess whitespace to be found. -// In Readability.js, normalizeSpaces default to true. +// getInnerText gets the inner text of a node. This also strips out any excess +// whitespace to be found. In Readability.js, normalizeSpaces defaults to true. func (ps *Parser) getInnerText(node *html.Node, normalizeSpaces bool) string { - textContent := strings.TrimSpace(dom.TextContent(node)) + textContent := dom.TextContent(node) if normalizeSpaces { - textContent = re2go.NormalizeSpaces(textContent) + return normalizeWhitespace(textContent) } - return textContent -} - -// getCharCount returns the number of times a string s -// appears in the node. -func (ps *Parser) getCharCount(node *html.Node, s string) int { - innerText := ps.getInnerText(node, true) - return strings.Count(innerText, s) + return strings.TrimSpace(textContent) } // cleanStyles removes the style attribute on every node and under. @@ -1702,26 +1678,48 @@ func (ps *Parser) cleanStyles(node *html.Node) { // content. This is the amount of text that is inside a link divided // by the total text in the node. 
func (ps *Parser) getLinkDensity(element *html.Node) float64 { - textLength := charCount(ps.getInnerText(element, true)) - if textLength == 0 { - return 0 - } - - var linkLength float64 - ps.forEachNode(dom.GetElementsByTagName(element, "a"), func(linkNode *html.Node, _ int) { - href := dom.GetAttribute(linkNode, "href") - href = strings.TrimSpace(href) - - coefficient := 1.0 - if href != "" && rxHashURL.MatchString(href) { - coefficient = 0.3 + chars := &charCounter{} + var linkCharsWeighted float64 + + var walk func(*html.Node, runeCounter) + walk = func(n *html.Node, linkCounter runeCounter) { + if n.Type == html.TextNode { + for _, r := range n.Data { + chars.Count(r) + if linkCounter != nil { + linkCounter.Count(r) + } + } + return } + if n.Type == html.ElementNode && n.Data == "a" { + cc := &charCounter{} + linkCoefficient := getLinkDensityCoefficient(n) + defer func() { + linkCharsWeighted += float64(cc.Total) * linkCoefficient + }() + linkCounter = cc + } + for child := range n.ChildNodes() { + walk(child, linkCounter) + } + } + walk(element, nil) - nodeLength := charCount(ps.getInnerText(linkNode, true)) - linkLength += float64(nodeLength) * coefficient - }) + if chars.Total == 0 { + return 0 + } + return linkCharsWeighted / float64(chars.Total) +} - return linkLength / float64(textLength) +// getLinkDensityCoefficient ensures that the text contents of links is scored lower for links +// that point to sections within the same document. +func getLinkDensityCoefficient(a *html.Node) float64 { + href := strings.TrimSpace(dom.GetAttribute(a, "href")) + if len(href) > 1 && href[0] == '#' { + return 0.3 + } + return 1.0 } // getClassWeight gets an elements class/id weight. 
Uses regular @@ -2014,18 +2012,6 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) { return false } - isList := tag == "ul" || tag == "ol" - if !isList { - var listLength int - listNodes := ps.getAllNodesWithTag(node, "ul", "ol") - ps.forEachNode(listNodes, func(list *html.Node, _ int) { - listLength += charCount(ps.getInnerText(list, true)) - }) - - nodeLength := charCount(ps.getInnerText(node, true)) - isList = float64(listLength)/float64(nodeLength) > 0.9 - } - // Next check if we're inside a data table, in which case don't remove it as well. if ps.hasAncestorTag(node, "table", -1, ps.isReadabilityDataTable) { return false @@ -2035,25 +2021,89 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) { return false } + // NOTE: Readability.js also uses a contentScore of 0 here var contentScore int weight := ps.getClassWeight(node) if weight+contentScore < 0 { return true } - if ps.getCharCount(node, ",") < 10 { - // If there are not very many commas, and the number of - // non-paragraph elements is more than paragraphs or other - // ominous signs, remove the element. - p := float64(len(dom.GetElementsByTagName(node, "p"))) - img := float64(len(dom.GetElementsByTagName(node, "img"))) - li := float64(len(dom.GetElementsByTagName(node, "li")) - 100) - input := float64(len(dom.GetElementsByTagName(node, "input"))) - headingDensity := ps.getTextDensity(node, "h1", "h2", "h3", "h4", "h5", "h6") + chars := &charCounter{} + listChars := &charCounter{} + var linkCharsWeighted float64 + headingChars := &charCounter{} + commas := &commaCounter{} + pCount := 0 + imgCount := 0 + liCount := 0 + inputCount := 0 + embedCount := 0 + + // Walk the DOM under this node to determine element counts and various types of content + // densities. Most notably, this scans for: + // - overall character count + // - character count of text under