Skip to content

Commit b06977a

Browse files
Archmonger and rmorshea authored
Use LXML for html_to_vdom (#795)
Co-authored-by: Ryan Morshead <[email protected]>
1 parent 2f0bb98 commit b06977a

File tree

6 files changed

+228
-94
lines changed

6 files changed

+228
-94
lines changed

docs/source/about/changelog.rst

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Unreleased
2525

2626
**Fixed**
2727

28+
- :issue:`777` - Fix edge cases where ``html_to_vdom`` can fail to convert HTML
2829
- :issue:`789` - Conditionally rendered components cannot use contexts
2930
- :issue:`773` - Use strict equality check for text, numeric, and binary types in hooks
3031
- :issue:`801` - Accidental mutation of old model causes invalid JSON Patch
@@ -38,6 +39,7 @@ Unreleased
3839
**Added**
3940

4041
- :pull:`123` - ``asgiref`` as a dependency
42+
- :pull:`795` - ``lxml`` as a dependency
4143

4244

4345
v0.39.0

requirements/pkg-deps.txt

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ fastjsonschema >=2.14.5
66
requests >=2
77
colorlog >=6
88
asgiref >=3
9+
lxml >= 4

src/idom/backend/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def run(
3535
implementation: BackendImplementation[Any] | None = None,
3636
) -> None:
3737
"""Run a component with a development server"""
38-
logger.warn(
38+
logger.warning(
3939
"You are running a development server. "
4040
"Change this before deploying in production!"
4141
)

src/idom/utils.py

+160-82
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1-
from html.parser import HTMLParser as _HTMLParser
2-
from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, TypeVar
1+
from itertools import chain
2+
from typing import Any, Callable, Generic, Iterable, List, TypeVar, Union
3+
4+
from lxml import etree
5+
from lxml.html import fragments_fromstring
6+
7+
import idom
8+
from idom.core.types import VdomDict
39

410

511
_RefValue = TypeVar("_RefValue")
12+
_ModelTransform = Callable[[VdomDict], Any]
613
_UNDEFINED: Any = object()
714

815

@@ -49,11 +56,9 @@ def __repr__(self) -> str:
4956
return f"{type(self).__name__}({current})"
5057

5158

52-
_ModelTransform = Callable[[Dict[str, Any]], Any]
53-
54-
55-
def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
56-
"""Transform HTML into a DOM model
59+
def html_to_vdom(html: str, *transforms: _ModelTransform, strict: bool = True) -> VdomDict:
60+
"""Transform HTML into a DOM model. Unique keys can be provided to HTML elements
61+
using a ``key=...`` attribute within your HTML tag.
5762
5863
Parameters:
5964
html:
@@ -62,81 +67,154 @@ def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
6267
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
6368
dictionary which will be replaced by ``new``. For example, you could use a
6469
transform function to add highlighting to a ``<code/>`` block.
70+
strict:
71+
If ``True``, raise an exception if the HTML does not perfectly follow HTML5
72+
syntax.
6573
"""
66-
parser = HtmlParser()
67-
parser.feed(source)
68-
root = parser.model()
69-
to_visit = [root]
70-
while to_visit:
71-
node = to_visit.pop(0)
72-
if isinstance(node, dict) and "children" in node:
73-
transformed = []
74-
for child in node["children"]:
75-
if isinstance(child, dict):
76-
for t in transforms:
77-
child = t(child)
78-
if child is not None:
79-
transformed.append(child)
80-
to_visit.append(child)
81-
node["children"] = transformed
82-
if "attributes" in node and not node["attributes"]:
83-
del node["attributes"]
84-
if "children" in node and not node["children"]:
85-
del node["children"]
86-
return root
87-
88-
89-
class HtmlParser(_HTMLParser):
90-
"""HTML to VDOM parser
91-
92-
Example:
93-
94-
.. code-block::
95-
96-
parser = HtmlParser()
97-
98-
parser.feed(an_html_string)
99-
parser.feed(another_html_string)
100-
...
101-
102-
vdom = parser.model()
74+
if not isinstance(html, str): # pragma: no cover
75+
raise TypeError(f"Expected html to be a string, not {type(html).__name__}")
76+
77+
# If the user provided a string, convert it to a list of lxml.etree nodes
78+
parser = etree.HTMLParser(
79+
remove_comments=True,
80+
remove_pis=True,
81+
remove_blank_text=True,
82+
recover=not strict,
83+
)
84+
try:
85+
nodes: List = fragments_fromstring(html, no_leading_text=True, parser=parser)
86+
except etree.XMLSyntaxError as e:
87+
if not strict:
88+
raise e # pragma: no cover
89+
raise HTMLParseError(
90+
"An error has occurred while parsing the HTML.\n\n"
91+
"This HTML may be malformatted, or may not perfectly adhere to HTML5.\n"
92+
"If you believe the exception above was due to something intentional, "
93+
"you can disable the strict parameter on html_to_vdom().\n"
94+
"Otherwise, repair your broken HTML and try again."
95+
) from e
96+
has_root_node = len(nodes) == 1
97+
98+
# Find or create a root node
99+
if has_root_node:
100+
root_node = nodes[0]
101+
else:
102+
# etree.Element requires a non-empty tag - we correct this below
103+
root_node = etree.Element("TEMP", None, None)
104+
for child in nodes:
105+
root_node.append(child)
106+
107+
# Convert the lxml node to a VDOM dict
108+
vdom = _etree_to_vdom(root_node, transforms)
109+
110+
# Change the artificially created root node to a React Fragment, instead of a div
111+
if not has_root_node:
112+
vdom["tagName"] = ""
113+
114+
return vdom
115+
116+
117+
def _etree_to_vdom(
118+
node: etree._Element, transforms: Iterable[_ModelTransform]
119+
) -> VdomDict:
120+
"""Recursively transform an lxml etree node into a DOM model
121+
122+
Parameters:
123+
node:
124+
The ``lxml.etree._Element`` node
125+
transforms:
126+
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
127+
dictionary which will be replaced by ``new``. For example, you could use a
128+
transform function to add highlighting to a ``<code/>`` block.
103129
"""
130+
if not isinstance(node, etree._Element): # pragma: no cover
131+
raise TypeError(
132+
f"Expected node to be a etree._Element, not {type(node).__name__}"
133+
)
104134

105-
def model(self) -> Dict[str, Any]:
106-
"""Get the current state of parsed VDOM model"""
107-
return self._node_stack[0]
108-
109-
def feed(self, data: str) -> None:
110-
"""Feed in HTML that will update the :meth:`HtmlParser.model`"""
111-
self._node_stack.append(self._make_vdom("div", {}))
112-
super().feed(data)
113-
114-
def reset(self) -> None:
115-
"""Reset the state of the parser"""
116-
self._node_stack: List[Dict[str, Any]] = []
117-
super().reset()
118-
119-
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
120-
new = self._make_vdom(tag, dict(attrs))
121-
current = self._node_stack[-1]
122-
current["children"].append(new)
123-
self._node_stack.append(new)
124-
125-
def handle_endtag(self, tag: str) -> None:
126-
del self._node_stack[-1]
127-
128-
def handle_data(self, data: str) -> None:
129-
self._node_stack[-1]["children"].append(data)
130-
131-
@staticmethod
132-
def _make_vdom(tag: str, attrs: Dict[str, Any]) -> Dict[str, Any]:
133-
if "style" in attrs:
134-
style = attrs["style"]
135-
if isinstance(style, str):
136-
style_dict = {}
137-
for k, v in (part.split(":", 1) for part in style.split(";") if part):
138-
title_case_key = k.title().replace("-", "")
139-
camel_case_key = title_case_key[:1].lower() + title_case_key[1:]
140-
style_dict[camel_case_key] = v
141-
attrs["style"] = style_dict
142-
return {"tagName": tag, "attributes": attrs, "children": []}
135+
# This will recursively call _etree_to_vdom() on all children
136+
children = _generate_vdom_children(node, transforms)
137+
138+
# Convert the lxml node to a VDOM dict
139+
attributes = dict(node.items())
140+
key = attributes.pop("key", None)
141+
142+
if hasattr(idom.html, node.tag):
143+
vdom = getattr(idom.html, node.tag)(attributes, *children, key=key)
144+
else:
145+
vdom: VdomDict = {"tagName": node.tag}
146+
if children:
147+
vdom["children"] = children
148+
if attributes:
149+
vdom["attributes"] = attributes
150+
if key is not None:
151+
vdom["key"] = key
152+
153+
# Perform any necessary mutations on the VDOM attributes to meet VDOM spec
154+
_mutate_vdom(vdom)
155+
156+
# Apply any provided transforms.
157+
for transform in transforms:
158+
vdom = transform(vdom)
159+
160+
return vdom
161+
162+
163+
def _mutate_vdom(vdom: VdomDict):
    """Adjust a VDOM dict in place so its attributes meet the VDOM spec.

    At present the only adjustment is to the ``style`` attribute: when it is a
    string of ``name: value`` declarations separated by ``;`` it is replaced by
    a dictionary whose keys are the camelCase form of each declaration name, so
    that React can render it.

    Declarations without a ``:`` are ignored. Nothing happens when there are no
    attributes, no ``style`` attribute, or the style is not a string.
    """
    if "attributes" not in vdom:
        return

    attributes = vdom["attributes"]
    if "style" not in attributes:
        return

    raw_style = attributes["style"]
    if not isinstance(raw_style, str):
        return

    # Convince type checker that it's safe to mutate attributes
    assert isinstance(attributes, dict)

    # Build the str -> dict conversion one declaration at a time
    style_dict = {}
    for declaration in raw_style.split(";"):
        if ":" not in declaration:
            continue
        # Only the first colon separates name from value (values may hold colons)
        name, _, value = declaration.partition(":")
        style_dict[_hypen_to_camel_case(name.strip())] = value.strip()

    attributes["style"] = style_dict
190+
191+
def _generate_vdom_children(
    node: etree._Element, transforms: Iterable[_ModelTransform]
) -> List[Union[VdomDict, str]]:
    """Build the list of VDOM children for an lxml node.

    The node's own leading text (if any) comes first. Each child element then
    follows in document order, converted with ``_etree_to_vdom``, and each is
    followed immediately by its tail text when that text is non-empty.
    """
    vdom_children: List[Union[VdomDict, str]] = []

    # Inner text that appears before the first child element
    if node.text:
        vdom_children.append(node.text)

    for element in node.iterchildren(None):
        # Recursively convert the child element to VDOM
        vdom_children.append(_etree_to_vdom(element, transforms))
        # Text between this child and the next sibling (or the closing tag)
        if element.tail:
            vdom_children.append(element.tail)

    return vdom_children
211+
212+
213+
def _hypen_to_camel_case(string: str) -> str:
214+
"""Convert a hypenated string to camelCase."""
215+
first, _, remainder = string.partition("-")
216+
return first.lower() + remainder.title().replace("-", "")
217+
218+
219+
class HTMLParseError(etree.LxmlSyntaxError):
    """Raised when an HTML document cannot be parsed using strict parsing.

    ``html_to_vdom`` raises this, chained from the underlying
    ``etree.XMLSyntaxError``, when parsing fails and ``strict`` is true.
    Because it subclasses ``etree.LxmlSyntaxError``, handlers written for that
    exception also catch this one.
    """

src/idom/widgets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def use_linked_inputs(
8080
value, set_value = idom.hooks.use_state(initial_value)
8181

8282
def sync_inputs(event: Dict[str, Any]) -> None:
83-
new_value = event["value"]
83+
new_value = event["target"]["value"]
8484
set_value(new_value)
8585
if not new_value and ignore_empty:
8686
return None

0 commit comments

Comments
 (0)