From 40ebeb0d064d98614e9f1af2008580519763ed75 Mon Sep 17 00:00:00 2001
From: Greg Price <greg@zulip.com>
Date: Thu, 9 Jan 2025 22:05:08 -0800
Subject: [PATCH 1/6] content [nfc]: Add a bit of dartdoc on parser

---
 lib/model/content.dart | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/lib/model/content.dart b/lib/model/content.dart
index c98a5ceaaf..7abafcd167 100644
--- a/lib/model/content.dart
+++ b/lib/model/content.dart
@@ -815,6 +815,9 @@ enum _ParserContext {
   inline,
 }
 
+/// Parser for a complete piece of Zulip HTML content, a [ZulipContent].
+///
+/// The only entry point to this class is [parse].
 class _ZulipContentParser {
   /// The current state of what sort of nodes the parser is looking for.
   ///
@@ -1039,6 +1042,8 @@ class _ZulipContentParser {
     return nodes.map(parseInlineContent).toList(growable: false);
   }
 
+  /// Parse the children of a [BlockInlineContainerNode], making up a
+  /// complete subtree of inline content with no further inline ancestors.
   ({List<InlineContentNode> nodes, List<LinkNode>? links}) parseBlockInline(List<dom.Node> nodes) {
     assert(_debugParserContext == _ParserContext.block);
     assert(() {
@@ -1660,6 +1665,8 @@ class _ZulipContentParser {
   }
 }
 
+/// Parse a complete piece of Zulip HTML content,
+/// such as an entire value of [Message.content].
 ZulipContent parseContent(String html) {
   return _ZulipContentParser().parse(html);
 }

From d0260b8dbe805b88d5dca011eb40640a0e618fc8 Mon Sep 17 00:00:00 2001
From: Greg Price <greg@zulip.com>
Date: Thu, 9 Jan 2025 21:30:05 -0800
Subject: [PATCH 2/6] content [nfc]: Split parseMath into parseInlineMath vs
 parseMathBlock

This makes all the uses of _debugParserContext maximally simple,
which will help us refactor it away.
---
 lib/model/content.dart | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/lib/model/content.dart b/lib/model/content.dart
index 7abafcd167..82f24beb54 100644
--- a/lib/model/content.dart
+++ b/lib/model/content.dart
@@ -825,9 +825,7 @@ class _ZulipContentParser {
   /// and should be read or updated only inside an assertion.
   _ParserContext _debugParserContext = _ParserContext.block;
 
-  String? parseMath(dom.Element element, {required bool block}) {
-    assert(block == (_debugParserContext == _ParserContext.block));
-
+  static String? _parseMath(dom.Element element, {required bool block}) {
     final dom.Element katexElement;
     if (!block) {
       assert(element.localName == 'span' && element.className == 'katex');
@@ -873,6 +871,11 @@ class _ZulipContentParser {
     return descendant4.text.trim();
   }
 
+  String? parseInlineMath(dom.Element element) {
+    assert(_debugParserContext == _ParserContext.inline);
+    return _parseMath(element, block: false);
+  }
+
   UserMentionNode? parseUserMention(dom.Element element) {
     assert(_debugParserContext == _ParserContext.inline);
     assert(element.localName == 'span');
@@ -1028,7 +1031,7 @@ class _ZulipContentParser {
     }
 
     if (localName == 'span' && className == 'katex') {
-      final texSource = parseMath(element, block: false);
+      final texSource = parseInlineMath(element);
       if (texSource == null) return unimplemented();
       return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
     }
@@ -1058,6 +1061,11 @@ class _ZulipContentParser {
     return (nodes: resultNodes, links: _takeLinkNodes());
   }
 
+  String? parseMathBlock(dom.Element element) {
+    assert(_debugParserContext == _ParserContext.block);
+    return _parseMath(element, block: true);
+  }
+
   BlockContentNode parseListNode(dom.Element element) {
     assert(_debugParserContext == _ParserContext.block);
     ListStyle? listStyle;
@@ -1485,7 +1493,7 @@ class _ZulipContentParser {
             // The case with the `<br>\n` can happen when at the end of a quote;
             // it seems like a glitch in the server's Markdown processing,
             // so hopefully there just aren't any further such glitches.
-            final texSource = parseMath(child, block: true);
+            final texSource = parseMathBlock(child);
             if (texSource == null) return UnimplementedBlockContentNode(htmlNode: node);
             return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
           }

From 7d0c0e4a019e5ea3aec2a6a4e7c45717c6f1d161 Mon Sep 17 00:00:00 2001
From: Greg Price <greg@zulip.com>
Date: Thu, 9 Jan 2025 22:08:36 -0800
Subject: [PATCH 3/6] content [nfc]: Move static _parseMath out from parser
 class

Because this is used for parsing both inline and block nodes,
moving it out will help us separate the rest of the parser code
between the handling of inline content and block content.
---
 lib/model/content.dart | 92 +++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/lib/model/content.dart b/lib/model/content.dart
index 82f24beb54..e21a4e9825 100644
--- a/lib/model/content.dart
+++ b/lib/model/content.dart
@@ -806,6 +806,52 @@ class GlobalTimeNode extends InlineContentNode {
 
 ////////////////////////////////////////////////////////////////
 
+String? _parseMath(dom.Element element, {required bool block}) {
+  final dom.Element katexElement;
+  if (!block) {
+    assert(element.localName == 'span' && element.className == 'katex');
+
+    katexElement = element;
+  } else {
+    assert(element.localName == 'span' && element.className == 'katex-display');
+
+    if (element.nodes.length != 1) return null;
+    final child = element.nodes.single;
+    if (child is! dom.Element) return null;
+    if (child.localName != 'span') return null;
+    if (child.className != 'katex') return null;
+    katexElement = child;
+  }
+
+  // Expect two children span.katex-mathml, span.katex-html .
+  // For now we only care about the .katex-mathml .
+  if (katexElement.nodes.isEmpty) return null;
+  final child = katexElement.nodes.first;
+  if (child is! dom.Element) return null;
+  if (child.localName != 'span') return null;
+  if (child.className != 'katex-mathml') return null;
+
+  if (child.nodes.length != 1) return null;
+  final grandchild = child.nodes.single;
+  if (grandchild is! dom.Element) return null;
+  if (grandchild.localName != 'math') return null;
+  if (grandchild.attributes['display'] != (block ? 'block' : null)) return null;
+  if (grandchild.namespaceUri != 'http://www.w3.org/1998/Math/MathML') return null;
+
+  if (grandchild.nodes.length != 1) return null;
+  final greatgrand = grandchild.nodes.single;
+  if (greatgrand is! dom.Element) return null;
+  if (greatgrand.localName != 'semantics') return null;
+
+  if (greatgrand.nodes.isEmpty) return null;
+  final descendant4 = greatgrand.nodes.last;
+  if (descendant4 is! dom.Element) return null;
+  if (descendant4.localName != 'annotation') return null;
+  if (descendant4.attributes['encoding'] != 'application/x-tex') return null;
+
+  return descendant4.text.trim();
+}
+
 /// What sort of nodes a [_ZulipContentParser] is currently expecting to find.
 enum _ParserContext {
   /// The parser is currently looking for block nodes.
@@ -825,52 +871,6 @@ class _ZulipContentParser {
   /// and should be read or updated only inside an assertion.
   _ParserContext _debugParserContext = _ParserContext.block;
 
-  static String? _parseMath(dom.Element element, {required bool block}) {
-    final dom.Element katexElement;
-    if (!block) {
-      assert(element.localName == 'span' && element.className == 'katex');
-
-      katexElement = element;
-    } else {
-      assert(element.localName == 'span' && element.className == 'katex-display');
-
-      if (element.nodes.length != 1) return null;
-      final child = element.nodes.single;
-      if (child is! dom.Element) return null;
-      if (child.localName != 'span') return null;
-      if (child.className != 'katex') return null;
-      katexElement = child;
-    }
-
-    // Expect two children span.katex-mathml, span.katex-html .
-    // For now we only care about the .katex-mathml .
-    if (katexElement.nodes.isEmpty) return null;
-    final child = katexElement.nodes.first;
-    if (child is! dom.Element) return null;
-    if (child.localName != 'span') return null;
-    if (child.className != 'katex-mathml') return null;
-
-    if (child.nodes.length != 1) return null;
-    final grandchild = child.nodes.single;
-    if (grandchild is! dom.Element) return null;
-    if (grandchild.localName != 'math') return null;
-    if (grandchild.attributes['display'] != (block ? 'block' : null)) return null;
-    if (grandchild.namespaceUri != 'http://www.w3.org/1998/Math/MathML') return null;
-
-    if (grandchild.nodes.length != 1) return null;
-    final greatgrand = grandchild.nodes.single;
-    if (greatgrand is! dom.Element) return null;
-    if (greatgrand.localName != 'semantics') return null;
-
-    if (greatgrand.nodes.isEmpty) return null;
-    final descendant4 = greatgrand.nodes.last;
-    if (descendant4 is! dom.Element) return null;
-    if (descendant4.localName != 'annotation') return null;
-    if (descendant4.attributes['encoding'] != 'application/x-tex') return null;
-
-    return descendant4.text.trim();
-  }
-
   String? parseInlineMath(dom.Element element) {
     assert(_debugParserContext == _ParserContext.inline);
     return _parseMath(element, block: false);

From cb9d18593b33671c2862cb6d4aa46ab6e768101a Mon Sep 17 00:00:00 2001
From: Greg Price <greg@zulip.com>
Date: Thu, 9 Jan 2025 21:24:55 -0800
Subject: [PATCH 4/6] content [nfc]: Split out _ZulipInlineContentParser class

This helps organize the parsing code a bit more cleanly -- not to
mention more statically, by making the dynamic _debugParserContext
assertions redundant.
---
 lib/model/content.dart | 68 ++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 43 deletions(-)

diff --git a/lib/model/content.dart b/lib/model/content.dart
index e21a4e9825..39b5f549e0 100644
--- a/lib/model/content.dart
+++ b/lib/model/content.dart
@@ -786,7 +786,7 @@ class MathInlineNode extends InlineContentNode {
 class GlobalTimeNode extends InlineContentNode {
   const GlobalTimeNode({super.debugHtmlNode, required this.datetime});
 
-  /// Always in UTC, enforced in [_ZulipContentParser.parseInlineContent].
+  /// Always in UTC, enforced in [_ZulipInlineContentParser.parseInlineContent].
   final DateTime datetime;
 
   @override
@@ -852,32 +852,19 @@ String? _parseMath(dom.Element element, {required bool block}) {
   return descendant4.text.trim();
 }
 
-/// What sort of nodes a [_ZulipContentParser] is currently expecting to find.
-enum _ParserContext {
-  /// The parser is currently looking for block nodes.
-  block,
-
-  /// The parser is currently looking for inline nodes.
-  inline,
-}
-
-/// Parser for a complete piece of Zulip HTML content, a [ZulipContent].
+/// Parser for the inline-content subtrees within Zulip content HTML.
 ///
-/// The only entry point to this class is [parse].
-class _ZulipContentParser {
-  /// The current state of what sort of nodes the parser is looking for.
-  ///
-  /// This exists for the sake of debug-mode checks,
-  /// and should be read or updated only inside an assertion.
-  _ParserContext _debugParserContext = _ParserContext.block;
-
+/// The only entry point to this class is [parseBlockInline].
+///
+/// After a call to [parseBlockInline] returns, the [_ZulipInlineContentParser]
+/// instance has been reset to its starting state, and can be re-used for
+/// parsing other subtrees.
+class _ZulipInlineContentParser {
   String? parseInlineMath(dom.Element element) {
-    assert(_debugParserContext == _ParserContext.inline);
     return _parseMath(element, block: false);
   }
 
   UserMentionNode? parseUserMention(dom.Element element) {
-    assert(_debugParserContext == _ParserContext.inline);
     assert(element.localName == 'span');
     final debugHtmlNode = kDebugMode ? element : null;
 
@@ -951,7 +938,6 @@ class _ZulipContentParser {
   static final _emojiCodeFromClassNameRegexp = RegExp(r"emoji-([^ ]+)");
 
   InlineContentNode parseInlineContent(dom.Node node) {
-    assert(_debugParserContext == _ParserContext.inline);
     final debugHtmlNode = kDebugMode ? node : null;
     InlineContentNode unimplemented() => UnimplementedInlineContentNode(htmlNode: node);
 
@@ -1041,33 +1027,38 @@ class _ZulipContentParser {
   }
 
   List<InlineContentNode> parseInlineContentList(List<dom.Node> nodes) {
-    assert(_debugParserContext == _ParserContext.inline);
     return nodes.map(parseInlineContent).toList(growable: false);
   }
 
   /// Parse the children of a [BlockInlineContainerNode], making up a
   /// complete subtree of inline content with no further inline ancestors.
   ({List<InlineContentNode> nodes, List<LinkNode>? links}) parseBlockInline(List<dom.Node> nodes) {
-    assert(_debugParserContext == _ParserContext.block);
-    assert(() {
-      _debugParserContext = _ParserContext.inline;
-      return true;
-    }());
     final resultNodes = parseInlineContentList(nodes);
-    assert(() {
-      _debugParserContext = _ParserContext.block;
-      return true;
-    }());
     return (nodes: resultNodes, links: _takeLinkNodes());
   }
+}
+
+/// Parser for a complete piece of Zulip HTML content, a [ZulipContent].
+///
+/// The only entry point to this class is [parse].
+class _ZulipContentParser {
+  /// The single inline-content parser used and re-used throughout parsing of
+  /// a complete piece of Zulip HTML content.
+  ///
+  /// Because block content can never appear nested inside inline content,
+  /// there's never a need for more than one of these at a time,
+  /// so we can allocate just one up front.
+  final inlineParser = _ZulipInlineContentParser();
+
+  ({List<InlineContentNode> nodes, List<LinkNode>? links}) parseBlockInline(List<dom.Node> nodes) {
+    return inlineParser.parseBlockInline(nodes);
+  }
 
   String? parseMathBlock(dom.Element element) {
-    assert(_debugParserContext == _ParserContext.block);
     return _parseMath(element, block: true);
   }
 
   BlockContentNode parseListNode(dom.Element element) {
-    assert(_debugParserContext == _ParserContext.block);
     ListStyle? listStyle;
     switch (element.localName) {
       case 'ol': listStyle = ListStyle.ordered; break;
@@ -1090,7 +1081,6 @@ class _ZulipContentParser {
   }
 
   BlockContentNode parseSpoilerNode(dom.Element divElement) {
-    assert(_debugParserContext == _ParserContext.block);
     assert(divElement.localName == 'div'
         && divElement.className == 'spoiler-block');
 
@@ -1110,7 +1100,6 @@ class _ZulipContentParser {
   }
 
   BlockContentNode parseCodeBlock(dom.Element divElement) {
-    assert(_debugParserContext == _ParserContext.block);
     final mainElement = () {
       assert(divElement.localName == 'div'
           && divElement.className == "codehilite");
@@ -1193,7 +1182,6 @@ class _ZulipContentParser {
   static final _imageDimensionsRegExp = RegExp(r'^(\d+)x(\d+)$');
 
   BlockContentNode parseImageNode(dom.Element divElement) {
-    assert(_debugParserContext == _ParserContext.block);
     final elements = () {
       assert(divElement.localName == 'div'
           && divElement.className == 'message_inline_image');
@@ -1285,7 +1273,6 @@ class _ZulipContentParser {
   }();
 
   BlockContentNode parseInlineVideoNode(dom.Element divElement) {
-    assert(_debugParserContext == _ParserContext.block);
     assert(divElement.localName == 'div'
       && _videoClassNameRegexp.hasMatch(divElement.className));
 
@@ -1318,7 +1305,6 @@ class _ZulipContentParser {
   }
 
   BlockContentNode parseEmbedVideoNode(dom.Element divElement) {
-    assert(_debugParserContext == _ParserContext.block);
     assert(divElement.localName == 'div'
       && _videoClassNameRegexp.hasMatch(divElement.className));
 
@@ -1357,7 +1343,6 @@ class _ZulipContentParser {
   }
 
   BlockContentNode parseTableContent(dom.Element tableElement) {
-    assert(_debugParserContext == _ParserContext.block);
     assert(tableElement.localName == 'table'
         && tableElement.className.isEmpty);
 
@@ -1465,7 +1450,6 @@ class _ZulipContentParser {
   }
 
   BlockContentNode parseBlockContent(dom.Node node) {
-    assert(_debugParserContext == _ParserContext.block);
     final debugHtmlNode = kDebugMode ? node : null;
     if (node is! dom.Element) {
       return UnimplementedBlockContentNode(htmlNode: node);
@@ -1592,7 +1576,6 @@ class _ZulipContentParser {
   ///
   /// See [ParagraphNode].
   List<BlockContentNode> parseImplicitParagraphBlockContentList(dom.NodeList nodes) {
-    assert(_debugParserContext == _ParserContext.block);
     final List<BlockContentNode> result = [];
     final List<dom.Node> currentParagraph = [];
     List<ImageNode> imageNodes = [];
@@ -1641,7 +1624,6 @@ class _ZulipContentParser {
   static final _redundantLineBreaksRegexp = RegExp(r'^\n+$');
 
   List<BlockContentNode> parseBlockContentList(dom.NodeList nodes) {
-    assert(_debugParserContext == _ParserContext.block);
     final List<BlockContentNode> result = [];
     List<ImageNode> imageNodes = [];
     for (final node in nodes) {

From 612c91eb6f9adcecbe16d8774c287d98b7983e20 Mon Sep 17 00:00:00 2001
From: Greg Price <greg@zulip.com>
Date: Thu, 9 Jan 2025 22:56:01 -0800
Subject: [PATCH 5/6] content [nfc]: Encapsulate parsing math nodes a bit more

---
 lib/model/content.dart | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/lib/model/content.dart b/lib/model/content.dart
index 39b5f549e0..2ea790796b 100644
--- a/lib/model/content.dart
+++ b/lib/model/content.dart
@@ -860,8 +860,11 @@ String? _parseMath(dom.Element element, {required bool block}) {
 /// instance has been reset to its starting state, and can be re-used for
 /// parsing other subtrees.
 class _ZulipInlineContentParser {
-  String? parseInlineMath(dom.Element element) {
-    return _parseMath(element, block: false);
+  InlineContentNode? parseInlineMath(dom.Element element) {
+    final debugHtmlNode = kDebugMode ? element : null;
+    final texSource = _parseMath(element, block: false);
+    if (texSource == null) return null;
+    return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
   }
 
   UserMentionNode? parseUserMention(dom.Element element) {
@@ -1017,9 +1020,7 @@ class _ZulipInlineContentParser {
     }
 
     if (localName == 'span' && className == 'katex') {
-      final texSource = parseInlineMath(element);
-      if (texSource == null) return unimplemented();
-      return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
+      return parseInlineMath(element) ?? unimplemented();
     }
 
     // TODO more types of node
@@ -1054,8 +1055,11 @@ class _ZulipContentParser {
     return inlineParser.parseBlockInline(nodes);
   }
 
-  String? parseMathBlock(dom.Element element) {
-    return _parseMath(element, block: true);
+  BlockContentNode parseMathBlock(dom.Element element) {
+    final debugHtmlNode = kDebugMode ? element : null;
+    final texSource = _parseMath(element, block: true);
+    if (texSource == null) return UnimplementedBlockContentNode(htmlNode: element);
+    return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
   }
 
   BlockContentNode parseListNode(dom.Element element) {
@@ -1477,9 +1481,7 @@ class _ZulipContentParser {
             // The case with the `<br>\n` can happen when at the end of a quote;
             // it seems like a glitch in the server's Markdown processing,
             // so hopefully there just aren't any further such glitches.
-            final texSource = parseMathBlock(child);
-            if (texSource == null) return UnimplementedBlockContentNode(htmlNode: node);
-            return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
+            return parseMathBlock(child);
           }
         }
       }

From 35d83f2192b3b6ae41b89d9ed2367e738e9ff12c Mon Sep 17 00:00:00 2001
From: Greg Price <greg@zulip.com>
Date: Thu, 9 Jan 2025 23:12:16 -0800
Subject: [PATCH 6/6] content [nfc]: Factor out consumeImageNodes in
 block-content parse loops

This hopefully makes the logic of these loops a bit more readable.
---
 lib/model/content.dart | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/lib/model/content.dart b/lib/model/content.dart
index 2ea790796b..e228163a2e 100644
--- a/lib/model/content.dart
+++ b/lib/model/content.dart
@@ -1579,8 +1579,14 @@ class _ZulipContentParser {
   /// See [ParagraphNode].
   List<BlockContentNode> parseImplicitParagraphBlockContentList(dom.NodeList nodes) {
     final List<BlockContentNode> result = [];
-    final List<dom.Node> currentParagraph = [];
+
     List<ImageNode> imageNodes = [];
+    void consumeImageNodes() {
+      result.add(ImageNodeList(imageNodes));
+      imageNodes = [];
+    }
+
+    final List<dom.Node> currentParagraph = [];
     void consumeParagraph() {
       final parsed = parseBlockInline(currentParagraph);
       result.add(ParagraphNode(
@@ -1595,8 +1601,7 @@ class _ZulipContentParser {
 
       if (_isPossibleInlineNode(node)) {
         if (imageNodes.isNotEmpty) {
-          result.add(ImageNodeList(imageNodes));
-          imageNodes = [];
+          consumeImageNodes();
           // In a context where paragraphs are implicit it should be impossible
           // to have more paragraph content after image previews.
           result.add(UnimplementedBlockContentNode(htmlNode: node));
@@ -1611,15 +1616,11 @@ class _ZulipContentParser {
         imageNodes.add(block);
         continue;
       }
-      if (imageNodes.isNotEmpty) {
-        result.add(ImageNodeList(imageNodes));
-        imageNodes = [];
-      }
+      if (imageNodes.isNotEmpty) consumeImageNodes();
       result.add(block);
     }
     if (currentParagraph.isNotEmpty) consumeParagraph();
-    if (imageNodes.isNotEmpty) result.add(ImageNodeList(imageNodes));
-
+    if (imageNodes.isNotEmpty) consumeImageNodes();
     return result;
   }
 
@@ -1627,7 +1628,13 @@ class _ZulipContentParser {
 
   List<BlockContentNode> parseBlockContentList(dom.NodeList nodes) {
     final List<BlockContentNode> result = [];
+
     List<ImageNode> imageNodes = [];
+    void consumeImageNodes() {
+      result.add(ImageNodeList(imageNodes));
+      imageNodes = [];
+    }
+
     for (final node in nodes) {
       // We get a bunch of newline Text nodes between paragraphs.
       // A browser seems to ignore these; let's do the same.
@@ -1640,13 +1647,10 @@ class _ZulipContentParser {
         imageNodes.add(block);
         continue;
       }
-      if (imageNodes.isNotEmpty) {
-        result.add(ImageNodeList(imageNodes));
-        imageNodes = [];
-      }
+      if (imageNodes.isNotEmpty) consumeImageNodes();
       result.add(block);
     }
-    if (imageNodes.isNotEmpty) result.add(ImageNodeList(imageNodes));
+    if (imageNodes.isNotEmpty) consumeImageNodes();
     return result;
   }