Use LXML for html_to_vdom (#795)

Archmonger · rmorshea · web-flow · commit b06977a10213 · 2022-08-13T20:41:59.000-07:00
Co-authored-by: Ryan Morshead <ryan.morshead@gmail.com>
diff --git a/docs/source/about/changelog.rst b/docs/source/about/changelog.rst
@@ -25,6 +25,7 @@ Unreleased
 
 **Fixed**
 
+- :issue:`777` - Fix edge cases where ``html_to_vdom`` can fail to convert HTML
 - :issue:`789` - Conditionally rendered components cannot use contexts
 - :issue:`773` - Use strict equality check for text, numeric, and binary types in hooks
 - :issue:`801` - Accidental mutation of old model causes invalid JSON Patch
@@ -38,6 +39,7 @@ Unreleased
 **Added**
 
 - :pull:`123` - ``asgiref`` as a dependency
+- :pull:`795` - ``lxml`` as a dependency
 
 
 v0.39.0
diff --git a/requirements/pkg-deps.txt b/requirements/pkg-deps.txt
@@ -6,3 +6,4 @@ fastjsonschema >=2.14.5
 requests >=2
 colorlog >=6
 asgiref >=3
+lxml >= 4
diff --git a/src/idom/backend/utils.py b/src/idom/backend/utils.py
@@ -35,7 +35,7 @@ def run(
     implementation: BackendImplementation[Any] | None = None,
 ) -> None:
     """Run a component with a development server"""
-    logger.warn(
+    logger.warning(
         "You are running a development server. "
         "Change this before deploying in production!"
     )
diff --git a/src/idom/utils.py b/src/idom/utils.py
@@ -1,8 +1,15 @@
-from html.parser import HTMLParser as _HTMLParser
-from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, TypeVar
+from itertools import chain
+from typing import Any, Callable, Generic, Iterable, List, TypeVar, Union
+
+from lxml import etree
+from lxml.html import fragments_fromstring
+
+import idom
+from idom.core.types import VdomDict
 
 
 _RefValue = TypeVar("_RefValue")
+_ModelTransform = Callable[[VdomDict], Any]
 _UNDEFINED: Any = object()
 
 
@@ -49,11 +56,9 @@ def __repr__(self) -> str:
         return f"{type(self).__name__}({current})"
 
 
-_ModelTransform = Callable[[Dict[str, Any]], Any]
-
-
-def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
-    """Transform HTML into a DOM model
+def html_to_vdom(html: str, *transforms: _ModelTransform, strict: bool = True) -> VdomDict:
+    """Transform HTML into a DOM model. Unique keys can be provided to HTML elements
+    using a ``key=...`` attribute within your HTML tag.
 
     Parameters:
         source:
@@ -62,81 +67,154 @@ def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
             Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
             dictionary which will be replaced by ``new``. For example, you could use a
             transform function to add highlighting to a ``<code/>`` block.
+        strict:
+            If ``True``, raise an exception if the HTML does not perfectly follow HTML5
+            syntax.
     """
-    parser = HtmlParser()
-    parser.feed(source)
-    root = parser.model()
-    to_visit = [root]
-    while to_visit:
-        node = to_visit.pop(0)
-        if isinstance(node, dict) and "children" in node:
-            transformed = []
-            for child in node["children"]:
-                if isinstance(child, dict):
-                    for t in transforms:
-                        child = t(child)
-                if child is not None:
-                    transformed.append(child)
-                    to_visit.append(child)
-            node["children"] = transformed
-            if "attributes" in node and not node["attributes"]:
-                del node["attributes"]
-            if "children" in node and not node["children"]:
-                del node["children"]
-    return root
-
-
-class HtmlParser(_HTMLParser):
-    """HTML to VDOM parser
-
-    Example:
-
-        .. code-block::
-
-            parser = HtmlParser()
-
-            parser.feed(an_html_string)
-            parser.feed(another_html_string)
-            ...
-
-            vdom = parser.model()
+    if not isinstance(html, str):  # pragma: no cover
+        raise TypeError(f"Expected html to be a string, not {type(html).__name__}")
+
+    # Parse the HTML string into a list of lxml.etree nodes
+    parser = etree.HTMLParser(
+        remove_comments=True,
+        remove_pis=True,
+        remove_blank_text=True,
+        recover=not strict,
+    )
+    try:
+        nodes: List = fragments_fromstring(html, no_leading_text=True, parser=parser)
+    except etree.XMLSyntaxError as e:
+        if not strict:
+            raise e  # pragma: no cover
+        raise HTMLParseError(
+            "An error has occurred while parsing the HTML.\n\n"
+            "This HTML may be malformatted, or may not perfectly adhere to HTML5.\n"
+            "If you believe the exception above was due to something intentional, "
+            "you can disable the strict parameter on html_to_vdom().\n"
+            "Otherwise, repair your broken HTML and try again."
+        ) from e
+    has_root_node = len(nodes) == 1
+
+    # Find or create a root node
+    if has_root_node:
+        root_node = nodes[0]
+    else:
+        # etree.Element requires a non-empty tag - we correct this below
+        root_node = etree.Element("TEMP", None, None)
+        for child in nodes:
+            root_node.append(child)
+
+    # Convert the lxml node to a VDOM dict
+    vdom = _etree_to_vdom(root_node, transforms)
+
+    # Turn the artificially created "TEMP" root node into a React Fragment
+    if not has_root_node:
+        vdom["tagName"] = ""
+
+    return vdom
+
+
+def _etree_to_vdom(
+    node: etree._Element, transforms: Iterable[_ModelTransform]
+) -> VdomDict:
+    """Recursively transform an lxml etree node into a DOM model
+
+    Parameters:
+        node:
+            The ``lxml.etree._Element`` node
+        transforms:
+            Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
+            dictionary which will be replaced by ``new``. For example, you could use a
+            transform function to add highlighting to a ``<code/>`` block.
     """
+    if not isinstance(node, etree._Element):  # pragma: no cover
+        raise TypeError(
+            f"Expected node to be a etree._Element, not {type(node).__name__}"
+        )
 
-    def model(self) -> Dict[str, Any]:
-        """Get the current state of parsed VDOM model"""
-        return self._node_stack[0]
-
-    def feed(self, data: str) -> None:
-        """Feed in HTML that will update the :meth:`HtmlParser.model`"""
-        self._node_stack.append(self._make_vdom("div", {}))
-        super().feed(data)
-
-    def reset(self) -> None:
-        """Reset the state of the parser"""
-        self._node_stack: List[Dict[str, Any]] = []
-        super().reset()
-
-    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
-        new = self._make_vdom(tag, dict(attrs))
-        current = self._node_stack[-1]
-        current["children"].append(new)
-        self._node_stack.append(new)
-
-    def handle_endtag(self, tag: str) -> None:
-        del self._node_stack[-1]
-
-    def handle_data(self, data: str) -> None:
-        self._node_stack[-1]["children"].append(data)
-
-    @staticmethod
-    def _make_vdom(tag: str, attrs: Dict[str, Any]) -> Dict[str, Any]:
-        if "style" in attrs:
-            style = attrs["style"]
-            if isinstance(style, str):
-                style_dict = {}
-                for k, v in (part.split(":", 1) for part in style.split(";") if part):
-                    title_case_key = k.title().replace("-", "")
-                    camel_case_key = title_case_key[:1].lower() + title_case_key[1:]
-                    style_dict[camel_case_key] = v
-                attrs["style"] = style_dict
-        return {"tagName": tag, "attributes": attrs, "children": []}
+    # This will recursively call _etree_to_vdom() on all children
+    children = _generate_vdom_children(node, transforms)
+
+    # Convert the lxml node to a VDOM dict
+    attributes = dict(node.items())
+    key = attributes.pop("key", None)
+
+    if hasattr(idom.html, node.tag):
+        vdom = getattr(idom.html, node.tag)(attributes, *children, key=key)
+    else:
+        vdom: VdomDict = {"tagName": node.tag}
+        if children:
+            vdom["children"] = children
+        if attributes:
+            vdom["attributes"] = attributes
+        if key is not None:
+            vdom["key"] = key
+
+    # Perform any necessary mutations on the VDOM attributes to meet VDOM spec
+    _mutate_vdom(vdom)
+
+    # Apply any provided transforms.
+    for transform in transforms:
+        vdom = transform(vdom)
+
+    return vdom
+
+
+def _mutate_vdom(vdom: VdomDict):
+    """Performs any necessary mutations on the VDOM attributes to meet VDOM spec.
+
+    Currently, this function only transforms the ``style`` attribute into a dictionary whose keys are
+    camelCase so as to be renderable by React.
+
+    This function may be extended in the future.
+    """
+    # Determine if the style attribute needs to be converted to a dict
+    if (
+        "attributes" in vdom
+        and "style" in vdom["attributes"]
+        and isinstance(vdom["attributes"]["style"], str)
+    ):
+        # Convince type checker that it's safe to mutate attributes
+        assert isinstance(vdom["attributes"], dict)
+
+        # Convert style attribute from str -> dict with camelCase keys
+        vdom["attributes"]["style"] = {
+            _hypen_to_camel_case(key.strip()): value.strip()
+            for key, value in (
+                part.split(":", 1)
+                for part in vdom["attributes"]["style"].split(";")
+                if ":" in part
+            )
+        }
+
+
+def _generate_vdom_children(
+    node: etree._Element, transforms: Iterable[_ModelTransform]
+) -> List[Union[VdomDict, str]]:
+    """Generates a list of VDOM children from an lxml node.
+
+    Inserts inner text and/or tail text in between VDOM children, if necessary.
+    """
+    return (  # Get the inner text of the current node
+        [node.text] if node.text else []
+    ) + list(
+        chain(
+            *(
+                # Recursively convert each child node to VDOM
+                [_etree_to_vdom(child, transforms)]
+                # Insert the tail text between each child node
+                + ([child.tail] if child.tail else [])
+                for child in node.iterchildren(None)
+            )
+        )
+    )
+
+
+def _hypen_to_camel_case(string: str) -> str:
+    """Convert a hyphenated string to camelCase."""
+    first, _, remainder = string.partition("-")
+    return first.lower() + remainder.title().replace("-", "")
+
+
+class HTMLParseError(etree.LxmlSyntaxError):
+    """Raised when an HTML document cannot be parsed using strict parsing."""
diff --git a/src/idom/widgets.py b/src/idom/widgets.py
@@ -80,7 +80,7 @@ def use_linked_inputs(
     value, set_value = idom.hooks.use_state(initial_value)
 
     def sync_inputs(event: Dict[str, Any]) -> None:
-        new_value = event["value"]
+        new_value = event["target"]["value"]
         set_value(new_value)
         if not new_value and ignore_empty:
             return None
diff --git a/tests/test_utils.py b/tests/test_utils.py

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ def run(`
`35`	`35`	`implementation: BackendImplementation[Any] \| None = None,`
`36`	`36`	`) -> None:`
`37`	`37`	`"""Run a component with a development server"""`
`38`		`- logger.warn(`
	`38`	`+ logger.warning(`
`39`	`39`	`"You are running a development server. "`
`40`	`40`	`"Change this before deploying in production!"`
`41`	`41`	`)`