From 40ebeb0d064d98614e9f1af2008580519763ed75 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 9 Jan 2025 22:05:08 -0800 Subject: [PATCH 1/6] content [nfc]: Add a bit of dartdoc on parser --- lib/model/content.dart | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/model/content.dart b/lib/model/content.dart index c98a5ceaaf..7abafcd167 100644 --- a/lib/model/content.dart +++ b/lib/model/content.dart @@ -815,6 +815,9 @@ enum _ParserContext { inline, } +/// Parser for a complete piece of Zulip HTML content, a [ZulipContent]. +/// +/// The only entry point to this class is [parse]. class _ZulipContentParser { /// The current state of what sort of nodes the parser is looking for. /// @@ -1039,6 +1042,8 @@ class _ZulipContentParser { return nodes.map(parseInlineContent).toList(growable: false); } + /// Parse the children of a [BlockInlineContainerNode], making up a + /// complete subtree of inline content with no further inline ancestors. ({List nodes, List? links}) parseBlockInline(List nodes) { assert(_debugParserContext == _ParserContext.block); assert(() { @@ -1660,6 +1665,8 @@ class _ZulipContentParser { } } +/// Parse a complete piece of Zulip HTML content, +/// such as an entire value of [Message.content]. ZulipContent parseContent(String html) { return _ZulipContentParser().parse(html); } From d0260b8dbe805b88d5dca011eb40640a0e618fc8 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 9 Jan 2025 21:30:05 -0800 Subject: [PATCH 2/6] content [nfc]: Split parseMath into parseInlineMath vs parseMathBlock This makes all the uses of _debugParserContext maximally simple, which will help us refactor it away. --- lib/model/content.dart | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/model/content.dart b/lib/model/content.dart index 7abafcd167..82f24beb54 100644 --- a/lib/model/content.dart +++ b/lib/model/content.dart @@ -825,9 +825,7 @@ class _ZulipContentParser { /// and should be read or updated only inside an assertion. _ParserContext _debugParserContext = _ParserContext.block; - String? parseMath(dom.Element element, {required bool block}) { - assert(block == (_debugParserContext == _ParserContext.block)); - + static String? _parseMath(dom.Element element, {required bool block}) { final dom.Element katexElement; if (!block) { assert(element.localName == 'span' && element.className == 'katex'); @@ -873,6 +871,11 @@ class _ZulipContentParser { return descendant4.text.trim(); } + String? parseInlineMath(dom.Element element) { + assert(_debugParserContext == _ParserContext.inline); + return _parseMath(element, block: false); + } + UserMentionNode? parseUserMention(dom.Element element) { assert(_debugParserContext == _ParserContext.inline); assert(element.localName == 'span'); @@ -1028,7 +1031,7 @@ class _ZulipContentParser { } if (localName == 'span' && className == 'katex') { - final texSource = parseMath(element, block: false); + final texSource = parseInlineMath(element); if (texSource == null) return unimplemented(); return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode); } @@ -1058,6 +1061,11 @@ class _ZulipContentParser { return (nodes: resultNodes, links: _takeLinkNodes()); } + String? parseMathBlock(dom.Element element) { + assert(_debugParserContext == _ParserContext.block); + return _parseMath(element, block: true); + } + BlockContentNode parseListNode(dom.Element element) { assert(_debugParserContext == _ParserContext.block); ListStyle? listStyle; @@ -1485,7 +1493,7 @@ class _ZulipContentParser { // The case with the `
\n` can happen when at the end of a quote; // it seems like a glitch in the server's Markdown processing, // so hopefully there just aren't any further such glitches. - final texSource = parseMath(child, block: true); + final texSource = parseMathBlock(child); if (texSource == null) return UnimplementedBlockContentNode(htmlNode: node); return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode); } From 7d0c0e4a019e5ea3aec2a6a4e7c45717c6f1d161 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 9 Jan 2025 22:08:36 -0800 Subject: [PATCH 3/6] content [nfc]: Move static _parseMath out from parser class Because this is used for parsing both inline and block nodes, moving it out will help us separate the rest of the parser code between the handling of inline content and block content. --- lib/model/content.dart | 92 +++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/lib/model/content.dart b/lib/model/content.dart index 82f24beb54..e21a4e9825 100644 --- a/lib/model/content.dart +++ b/lib/model/content.dart @@ -806,6 +806,52 @@ class GlobalTimeNode extends InlineContentNode { //////////////////////////////////////////////////////////////// +String? _parseMath(dom.Element element, {required bool block}) { + final dom.Element katexElement; + if (!block) { + assert(element.localName == 'span' && element.className == 'katex'); + + katexElement = element; + } else { + assert(element.localName == 'span' && element.className == 'katex-display'); + + if (element.nodes.length != 1) return null; + final child = element.nodes.single; + if (child is! dom.Element) return null; + if (child.localName != 'span') return null; + if (child.className != 'katex') return null; + katexElement = child; + } + + // Expect two children span.katex-mathml, span.katex-html . + // For now we only care about the .katex-mathml . + if (katexElement.nodes.isEmpty) return null; + final child = katexElement.nodes.first; + if (child is! dom.Element) return null; + if (child.localName != 'span') return null; + if (child.className != 'katex-mathml') return null; + + if (child.nodes.length != 1) return null; + final grandchild = child.nodes.single; + if (grandchild is! dom.Element) return null; + if (grandchild.localName != 'math') return null; + if (grandchild.attributes['display'] != (block ? 'block' : null)) return null; + if (grandchild.namespaceUri != 'http://www.w3.org/1998/Math/MathML') return null; + + if (grandchild.nodes.length != 1) return null; + final greatgrand = grandchild.nodes.single; + if (greatgrand is! dom.Element) return null; + if (greatgrand.localName != 'semantics') return null; + + if (greatgrand.nodes.isEmpty) return null; + final descendant4 = greatgrand.nodes.last; + if (descendant4 is! dom.Element) return null; + if (descendant4.localName != 'annotation') return null; + if (descendant4.attributes['encoding'] != 'application/x-tex') return null; + + return descendant4.text.trim(); +} + /// What sort of nodes a [_ZulipContentParser] is currently expecting to find. enum _ParserContext { /// The parser is currently looking for block nodes. @@ -825,52 +871,6 @@ class _ZulipContentParser { /// and should be read or updated only inside an assertion. _ParserContext _debugParserContext = _ParserContext.block; - static String? _parseMath(dom.Element element, {required bool block}) { - final dom.Element katexElement; - if (!block) { - assert(element.localName == 'span' && element.className == 'katex'); - - katexElement = element; - } else { - assert(element.localName == 'span' && element.className == 'katex-display'); - - if (element.nodes.length != 1) return null; - final child = element.nodes.single; - if (child is! dom.Element) return null; - if (child.localName != 'span') return null; - if (child.className != 'katex') return null; - katexElement = child; - } - - // Expect two children span.katex-mathml, span.katex-html . - // For now we only care about the .katex-mathml . - if (katexElement.nodes.isEmpty) return null; - final child = katexElement.nodes.first; - if (child is! dom.Element) return null; - if (child.localName != 'span') return null; - if (child.className != 'katex-mathml') return null; - - if (child.nodes.length != 1) return null; - final grandchild = child.nodes.single; - if (grandchild is! dom.Element) return null; - if (grandchild.localName != 'math') return null; - if (grandchild.attributes['display'] != (block ? 'block' : null)) return null; - if (grandchild.namespaceUri != 'http://www.w3.org/1998/Math/MathML') return null; - - if (grandchild.nodes.length != 1) return null; - final greatgrand = grandchild.nodes.single; - if (greatgrand is! dom.Element) return null; - if (greatgrand.localName != 'semantics') return null; - - if (greatgrand.nodes.isEmpty) return null; - final descendant4 = greatgrand.nodes.last; - if (descendant4 is! dom.Element) return null; - if (descendant4.localName != 'annotation') return null; - if (descendant4.attributes['encoding'] != 'application/x-tex') return null; - - return descendant4.text.trim(); - } - String? parseInlineMath(dom.Element element) { assert(_debugParserContext == _ParserContext.inline); return _parseMath(element, block: false); From cb9d18593b33671c2862cb6d4aa46ab6e768101a Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 9 Jan 2025 21:24:55 -0800 Subject: [PATCH 4/6] content [nfc]: Split out _ZulipInlineContentParser class This helps organize the parsing code a bit more cleanly -- not to mention more statically, by making the dynamic _debugParserContext assertions redundant. --- lib/model/content.dart | 68 ++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/lib/model/content.dart b/lib/model/content.dart index e21a4e9825..39b5f549e0 100644 --- a/lib/model/content.dart +++ b/lib/model/content.dart @@ -786,7 +786,7 @@ class MathInlineNode extends InlineContentNode { class GlobalTimeNode extends InlineContentNode { const GlobalTimeNode({super.debugHtmlNode, required this.datetime}); - /// Always in UTC, enforced in [_ZulipContentParser.parseInlineContent]. + /// Always in UTC, enforced in [_ZulipInlineContentParser.parseInlineContent]. final DateTime datetime; @override @@ -852,32 +852,19 @@ String? _parseMath(dom.Element element, {required bool block}) { return descendant4.text.trim(); } -/// What sort of nodes a [_ZulipContentParser] is currently expecting to find. -enum _ParserContext { - /// The parser is currently looking for block nodes. - block, - - /// The parser is currently looking for inline nodes. - inline, -} - -/// Parser for a complete piece of Zulip HTML content, a [ZulipContent]. +/// Parser for the inline-content subtrees within Zulip content HTML. /// -/// The only entry point to this class is [parse]. -class _ZulipContentParser { - /// The current state of what sort of nodes the parser is looking for. - /// - /// This exists for the sake of debug-mode checks, - /// and should be read or updated only inside an assertion. - _ParserContext _debugParserContext = _ParserContext.block; - +/// The only entry point to this class is [parseBlockInline]. +/// +/// After a call to [parseBlockInline] returns, the [_ZulipInlineContentParser] +/// instance has been reset to its starting state, and can be re-used for +/// parsing other subtrees. +class _ZulipInlineContentParser { String? parseInlineMath(dom.Element element) { - assert(_debugParserContext == _ParserContext.inline); return _parseMath(element, block: false); } UserMentionNode? parseUserMention(dom.Element element) { - assert(_debugParserContext == _ParserContext.inline); assert(element.localName == 'span'); final debugHtmlNode = kDebugMode ? element : null; @@ -951,7 +938,6 @@ class _ZulipContentParser { static final _emojiCodeFromClassNameRegexp = RegExp(r"emoji-([^ ]+)"); InlineContentNode parseInlineContent(dom.Node node) { - assert(_debugParserContext == _ParserContext.inline); final debugHtmlNode = kDebugMode ? node : null; InlineContentNode unimplemented() => UnimplementedInlineContentNode(htmlNode: node); @@ -1041,33 +1027,38 @@ class _ZulipContentParser { } List parseInlineContentList(List nodes) { - assert(_debugParserContext == _ParserContext.inline); return nodes.map(parseInlineContent).toList(growable: false); } /// Parse the children of a [BlockInlineContainerNode], making up a /// complete subtree of inline content with no further inline ancestors. ({List nodes, List? links}) parseBlockInline(List nodes) { - assert(_debugParserContext == _ParserContext.block); - assert(() { - _debugParserContext = _ParserContext.inline; - return true; - }()); final resultNodes = parseInlineContentList(nodes); - assert(() { - _debugParserContext = _ParserContext.block; - return true; - }()); return (nodes: resultNodes, links: _takeLinkNodes()); } +} + +/// Parser for a complete piece of Zulip HTML content, a [ZulipContent]. +/// +/// The only entry point to this class is [parse]. +class _ZulipContentParser { + /// The single inline-content parser used and re-used throughout parsing of + /// a complete piece of Zulip HTML content. + /// + /// Because block content can never appear nested inside inline content, + /// there's never a need for more than one of these at a time, + /// so we can allocate just one up front. + final inlineParser = _ZulipInlineContentParser(); + + ({List nodes, List? links}) parseBlockInline(List nodes) { + return inlineParser.parseBlockInline(nodes); + } String? parseMathBlock(dom.Element element) { - assert(_debugParserContext == _ParserContext.block); return _parseMath(element, block: true); } BlockContentNode parseListNode(dom.Element element) { - assert(_debugParserContext == _ParserContext.block); ListStyle? listStyle; switch (element.localName) { case 'ol': listStyle = ListStyle.ordered; break; @@ -1090,7 +1081,6 @@ class _ZulipContentParser { } BlockContentNode parseSpoilerNode(dom.Element divElement) { - assert(_debugParserContext == _ParserContext.block); assert(divElement.localName == 'div' && divElement.className == 'spoiler-block'); @@ -1110,7 +1100,6 @@ class _ZulipContentParser { } BlockContentNode parseCodeBlock(dom.Element divElement) { - assert(_debugParserContext == _ParserContext.block); final mainElement = () { assert(divElement.localName == 'div' && divElement.className == "codehilite"); @@ -1193,7 +1182,6 @@ class _ZulipContentParser { static final _imageDimensionsRegExp = RegExp(r'^(\d+)x(\d+)$'); BlockContentNode parseImageNode(dom.Element divElement) { - assert(_debugParserContext == _ParserContext.block); final elements = () { assert(divElement.localName == 'div' && divElement.className == 'message_inline_image'); @@ -1285,7 +1273,6 @@ class _ZulipContentParser { }(); BlockContentNode parseInlineVideoNode(dom.Element divElement) { - assert(_debugParserContext == _ParserContext.block); assert(divElement.localName == 'div' && _videoClassNameRegexp.hasMatch(divElement.className)); @@ -1318,7 +1305,6 @@ class _ZulipContentParser { } BlockContentNode parseEmbedVideoNode(dom.Element divElement) { - assert(_debugParserContext == _ParserContext.block); assert(divElement.localName == 'div' && _videoClassNameRegexp.hasMatch(divElement.className)); @@ -1357,7 +1343,6 @@ class _ZulipContentParser { } BlockContentNode parseTableContent(dom.Element tableElement) { - assert(_debugParserContext == _ParserContext.block); assert(tableElement.localName == 'table' && tableElement.className.isEmpty); @@ -1465,7 +1450,6 @@ class _ZulipContentParser { } BlockContentNode parseBlockContent(dom.Node node) { - assert(_debugParserContext == _ParserContext.block); final debugHtmlNode = kDebugMode ? node : null; if (node is! dom.Element) { return UnimplementedBlockContentNode(htmlNode: node); @@ -1592,7 +1576,6 @@ class _ZulipContentParser { /// /// See [ParagraphNode]. List parseImplicitParagraphBlockContentList(dom.NodeList nodes) { - assert(_debugParserContext == _ParserContext.block); final List result = []; final List currentParagraph = []; List imageNodes = []; @@ -1641,7 +1624,6 @@ class _ZulipContentParser { static final _redundantLineBreaksRegexp = RegExp(r'^\n+$'); List parseBlockContentList(dom.NodeList nodes) { - assert(_debugParserContext == _ParserContext.block); final List result = []; List imageNodes = []; for (final node in nodes) { From 612c91eb6f9adcecbe16d8774c287d98b7983e20 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 9 Jan 2025 22:56:01 -0800 Subject: [PATCH 5/6] content [nfc]: Encapsulate parsing math nodes a bit more --- lib/model/content.dart | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/model/content.dart b/lib/model/content.dart index 39b5f549e0..2ea790796b 100644 --- a/lib/model/content.dart +++ b/lib/model/content.dart @@ -860,8 +860,11 @@ String? _parseMath(dom.Element element, {required bool block}) { /// instance has been reset to its starting state, and can be re-used for /// parsing other subtrees. class _ZulipInlineContentParser { - String? parseInlineMath(dom.Element element) { - return _parseMath(element, block: false); + InlineContentNode? parseInlineMath(dom.Element element) { + final debugHtmlNode = kDebugMode ? element : null; + final texSource = _parseMath(element, block: false); + if (texSource == null) return null; + return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode); } UserMentionNode? parseUserMention(dom.Element element) { @@ -1017,9 +1020,7 @@ class _ZulipInlineContentParser { } if (localName == 'span' && className == 'katex') { - final texSource = parseInlineMath(element); - if (texSource == null) return unimplemented(); - return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode); + return parseInlineMath(element) ?? unimplemented(); } // TODO more types of node @@ -1054,8 +1055,11 @@ class _ZulipContentParser { return inlineParser.parseBlockInline(nodes); } - String? parseMathBlock(dom.Element element) { - return _parseMath(element, block: true); + BlockContentNode parseMathBlock(dom.Element element) { + final debugHtmlNode = kDebugMode ? element : null; + final texSource = _parseMath(element, block: true); + if (texSource == null) return UnimplementedBlockContentNode(htmlNode: element); + return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode); } BlockContentNode parseListNode(dom.Element element) { @@ -1477,9 +1481,7 @@ class _ZulipContentParser { // The case with the `
\n` can happen when at the end of a quote; // it seems like a glitch in the server's Markdown processing, // so hopefully there just aren't any further such glitches. - final texSource = parseMathBlock(child); - if (texSource == null) return UnimplementedBlockContentNode(htmlNode: node); - return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode); + return parseMathBlock(child); } } } From 35d83f2192b3b6ae41b89d9ed2367e738e9ff12c Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 9 Jan 2025 23:12:16 -0800 Subject: [PATCH 6/6] content [nfc]: Factor out consumeImageNodes in block-content parse loops This hopefully makes the logic of these loops a bit more readable. --- lib/model/content.dart | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/lib/model/content.dart b/lib/model/content.dart index 2ea790796b..e228163a2e 100644 --- a/lib/model/content.dart +++ b/lib/model/content.dart @@ -1579,8 +1579,14 @@ class _ZulipContentParser { /// See [ParagraphNode]. List parseImplicitParagraphBlockContentList(dom.NodeList nodes) { final List result = []; - final List currentParagraph = []; + List imageNodes = []; + void consumeImageNodes() { + result.add(ImageNodeList(imageNodes)); + imageNodes = []; + } + + final List currentParagraph = []; void consumeParagraph() { final parsed = parseBlockInline(currentParagraph); result.add(ParagraphNode( @@ -1595,8 +1601,7 @@ class _ZulipContentParser { if (_isPossibleInlineNode(node)) { if (imageNodes.isNotEmpty) { - result.add(ImageNodeList(imageNodes)); - imageNodes = []; + consumeImageNodes(); // In a context where paragraphs are implicit it should be impossible // to have more paragraph content after image previews. result.add(UnimplementedBlockContentNode(htmlNode: node)); @@ -1611,15 +1616,11 @@ class _ZulipContentParser { imageNodes.add(block); continue; } - if (imageNodes.isNotEmpty) { - result.add(ImageNodeList(imageNodes)); - imageNodes = []; - } + if (imageNodes.isNotEmpty) consumeImageNodes(); result.add(block); } if (currentParagraph.isNotEmpty) consumeParagraph(); - if (imageNodes.isNotEmpty) result.add(ImageNodeList(imageNodes)); - + if (imageNodes.isNotEmpty) consumeImageNodes(); return result; } @@ -1627,7 +1628,13 @@ class _ZulipContentParser { List parseBlockContentList(dom.NodeList nodes) { final List result = []; + List imageNodes = []; + void consumeImageNodes() { + result.add(ImageNodeList(imageNodes)); + imageNodes = []; + } + for (final node in nodes) { // We get a bunch of newline Text nodes between paragraphs. // A browser seems to ignore these; let's do the same. @@ -1640,13 +1647,10 @@ class _ZulipContentParser { imageNodes.add(block); continue; } - if (imageNodes.isNotEmpty) { - result.add(ImageNodeList(imageNodes)); - imageNodes = []; - } + if (imageNodes.isNotEmpty) consumeImageNodes(); result.add(block); } - if (imageNodes.isNotEmpty) result.add(ImageNodeList(imageNodes)); + if (imageNodes.isNotEmpty) consumeImageNodes(); return result; }