1
- from html .parser import HTMLParser as _HTMLParser
2
- from typing import Any , Callable , Dict , Generic , List , Optional , Tuple , TypeVar
1
+ from itertools import chain
2
+ from typing import Any , Callable , Generic , Iterable , List , TypeVar , Union
3
+
4
+ from lxml import etree
5
+ from lxml .html import fragments_fromstring
6
+
7
+ import idom
8
+ from idom .core .types import VdomDict
3
9
4
10
5
11
_RefValue = TypeVar ("_RefValue" )
12
+ _ModelTransform = Callable [[VdomDict ], Any ]
6
13
_UNDEFINED : Any = object ()
7
14
8
15
@@ -49,11 +56,9 @@ def __repr__(self) -> str:
49
56
return f"{ type (self ).__name__ } ({ current } )"
50
57
51
58
52
- _ModelTransform = Callable [[Dict [str , Any ]], Any ]
53
-
54
-
55
- def html_to_vdom (source : str , * transforms : _ModelTransform ) -> Dict [str , Any ]:
56
- """Transform HTML into a DOM model
59
+ def html_to_vdom (html : str , * transforms : _ModelTransform , strict : bool = True ) -> VdomDict :
60
+ """Transform HTML into a DOM model. Unique keys can be provided to HTML elements
61
+ using a ``key=...`` attribute within your HTML tag.
57
62
58
63
Parameters:
59
64
html:
@@ -62,81 +67,154 @@ def html_to_vdom(source: str, *transforms: _ModelTransform) -> Dict[str, Any]:
62
67
Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
63
68
dictionary which will be replaced by ``new``. For example, you could use a
64
69
transform function to add highlighting to a ``<code/>`` block.
70
+ strict:
71
+ If ``True``, raise an exception if the HTML does not perfectly follow HTML5
72
+ syntax.
65
73
"""
66
- parser = HtmlParser ()
67
- parser .feed (source )
68
- root = parser .model ()
69
- to_visit = [root ]
70
- while to_visit :
71
- node = to_visit .pop (0 )
72
- if isinstance (node , dict ) and "children" in node :
73
- transformed = []
74
- for child in node ["children" ]:
75
- if isinstance (child , dict ):
76
- for t in transforms :
77
- child = t (child )
78
- if child is not None :
79
- transformed .append (child )
80
- to_visit .append (child )
81
- node ["children" ] = transformed
82
- if "attributes" in node and not node ["attributes" ]:
83
- del node ["attributes" ]
84
- if "children" in node and not node ["children" ]:
85
- del node ["children" ]
86
- return root
87
-
88
-
89
- class HtmlParser (_HTMLParser ):
90
- """HTML to VDOM parser
91
-
92
- Example:
93
-
94
- .. code-block::
95
-
96
- parser = HtmlParser()
97
-
98
- parser.feed(an_html_string)
99
- parser.feed(another_html_string)
100
- ...
101
-
102
- vdom = parser.model()
74
+ if not isinstance (html , str ): # pragma: no cover
75
+ raise TypeError (f"Expected html to be a string, not { type (html ).__name__ } " )
76
+
77
+ # If the user provided a string, convert it to a list of lxml.etree nodes
78
+ parser = etree .HTMLParser (
79
+ remove_comments = True ,
80
+ remove_pis = True ,
81
+ remove_blank_text = True ,
82
+ recover = not strict ,
83
+ )
84
+ try :
85
+ nodes : List = fragments_fromstring (html , no_leading_text = True , parser = parser )
86
+ except etree .XMLSyntaxError as e :
87
+ if not strict :
88
+ raise e # pragma: no cover
89
+ raise HTMLParseError (
90
+ "An error has occurred while parsing the HTML.\n \n "
91
+ "This HTML may be malformatted, or may not perfectly adhere to HTML5.\n "
92
+ "If you believe the exception above was due to something intentional, "
93
+ "you can disable the strict parameter on html_to_vdom().\n "
94
+ "Otherwise, repair your broken HTML and try again."
95
+ ) from e
96
+ has_root_node = len (nodes ) == 1
97
+
98
+ # Find or create a root node
99
+ if has_root_node :
100
+ root_node = nodes [0 ]
101
+ else :
102
+ # etree.Element requires a non-empty tag - we correct this below
103
+ root_node = etree .Element ("TEMP" , None , None )
104
+ for child in nodes :
105
+ root_node .append (child )
106
+
107
+ # Convert the lxml node to a VDOM dict
108
+ vdom = _etree_to_vdom (root_node , transforms )
109
+
110
+ # Change the artificially created root node to a React Fragment, instead of a div
111
+ if not has_root_node :
112
+ vdom ["tagName" ] = ""
113
+
114
+ return vdom
115
+
116
+
117
+ def _etree_to_vdom (
118
+ node : etree ._Element , transforms : Iterable [_ModelTransform ]
119
+ ) -> VdomDict :
120
+ """Recusively transform an lxml etree node into a DOM model
121
+
122
+ Parameters:
123
+ source:
124
+ The ``lxml.etree._Element`` node
125
+ transforms:
126
+ Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
127
+ dictionary which will be replaced by ``new``. For example, you could use a
128
+ transform function to add highlighting to a ``<code/>`` block.
103
129
"""
130
+ if not isinstance (node , etree ._Element ): # pragma: no cover
131
+ raise TypeError (
132
+ f"Expected node to be a etree._Element, not { type (node ).__name__ } "
133
+ )
104
134
105
- def model (self ) -> Dict [str , Any ]:
106
- """Get the current state of parsed VDOM model"""
107
- return self ._node_stack [0 ]
108
-
109
- def feed (self , data : str ) -> None :
110
- """Feed in HTML that will update the :meth:`HtmlParser.model`"""
111
- self ._node_stack .append (self ._make_vdom ("div" , {}))
112
- super ().feed (data )
113
-
114
- def reset (self ) -> None :
115
- """Reset the state of the parser"""
116
- self ._node_stack : List [Dict [str , Any ]] = []
117
- super ().reset ()
118
-
119
- def handle_starttag (self , tag : str , attrs : List [Tuple [str , Optional [str ]]]) -> None :
120
- new = self ._make_vdom (tag , dict (attrs ))
121
- current = self ._node_stack [- 1 ]
122
- current ["children" ].append (new )
123
- self ._node_stack .append (new )
124
-
125
- def handle_endtag (self , tag : str ) -> None :
126
- del self ._node_stack [- 1 ]
127
-
128
- def handle_data (self , data : str ) -> None :
129
- self ._node_stack [- 1 ]["children" ].append (data )
130
-
131
- @staticmethod
132
- def _make_vdom (tag : str , attrs : Dict [str , Any ]) -> Dict [str , Any ]:
133
- if "style" in attrs :
134
- style = attrs ["style" ]
135
- if isinstance (style , str ):
136
- style_dict = {}
137
- for k , v in (part .split (":" , 1 ) for part in style .split (";" ) if part ):
138
- title_case_key = k .title ().replace ("-" , "" )
139
- camel_case_key = title_case_key [:1 ].lower () + title_case_key [1 :]
140
- style_dict [camel_case_key ] = v
141
- attrs ["style" ] = style_dict
142
- return {"tagName" : tag , "attributes" : attrs , "children" : []}
135
+ # This will recursively call _etree_to_vdom() on all children
136
+ children = _generate_vdom_children (node , transforms )
137
+
138
+ # Convert the lxml node to a VDOM dict
139
+ attributes = dict (node .items ())
140
+ key = attributes .pop ("key" , None )
141
+
142
+ if hasattr (idom .html , node .tag ):
143
+ vdom = getattr (idom .html , node .tag )(attributes , * children , key = key )
144
+ else :
145
+ vdom : VdomDict = {"tagName" : node .tag }
146
+ if children :
147
+ vdom ["children" ] = children
148
+ if attributes :
149
+ vdom ["attributes" ] = attributes
150
+ if key is not None :
151
+ vdom ["key" ] = key
152
+
153
+ # Perform any necessary mutations on the VDOM attributes to meet VDOM spec
154
+ _mutate_vdom (vdom )
155
+
156
+ # Apply any provided transforms.
157
+ for transform in transforms :
158
+ vdom = transform (vdom )
159
+
160
+ return vdom
161
+
162
+
163
def _mutate_vdom(vdom: VdomDict):
    """Mutate ``vdom`` in place so that its attributes meet the VDOM spec.

    At present the only adjustment is to the ``style`` attribute: when it is a
    string such as ``"background-color: red; margin: 0"`` it is replaced by a
    dictionary with camelCase keys (``{"backgroundColor": "red", "margin": "0"}``)
    so that it is renderable by React.

    This function may be extended in the future.
    """
    # Nothing to do unless there is a string valued style attribute
    if "attributes" not in vdom:
        return
    attrs = vdom["attributes"]
    if "style" not in attrs:
        return
    raw_style = attrs["style"]
    if not isinstance(raw_style, str):
        return

    # Convince type checker that it's safe to mutate attributes
    assert isinstance(attrs, dict)

    # Each "name: value" declaration becomes one camelCase entry; fragments
    # without a colon (for example the empty piece after a trailing ";") are
    # skipped
    style_dict = {}
    for declaration in raw_style.split(";"):
        if ":" not in declaration:
            continue
        prop, value = declaration.split(":", 1)
        style_dict[_hypen_to_camel_case(prop.strip())] = value.strip()

    attrs["style"] = style_dict
189
+
190
+
191
def _generate_vdom_children(
    node: etree._Element, transforms: Iterable[_ModelTransform]
) -> List[Union[VdomDict, str]]:
    """Build the list of VDOM children for an lxml node.

    The result interleaves text and elements in document order: the node's own
    inner text (if any) comes first, then each child element converted to VDOM,
    each followed by its tail text (if any).
    """
    vdom_children: List[Union[VdomDict, str]] = []

    # Text that appears before the first child element
    if node.text:
        vdom_children.append(node.text)

    for child in node.iterchildren(None):
        # Recursively convert the child element to VDOM
        vdom_children.append(_etree_to_vdom(child, transforms))
        # Text that follows the child, before the next sibling or closing tag
        if child.tail:
            vdom_children.append(child.tail)

    return vdom_children
211
+
212
+
213
+ def _hypen_to_camel_case (string : str ) -> str :
214
+ """Convert a hypenated string to camelCase."""
215
+ first , _ , remainder = string .partition ("-" )
216
+ return first .lower () + remainder .title ().replace ("-" , "" )
217
+
218
+
219
class HTMLParseError(etree.LxmlSyntaxError):
    """Signals that HTML could not be parsed while strict parsing was in effect.

    Derives from ``etree.LxmlSyntaxError``, so handlers written for lxml syntax
    errors will catch it as well.
    """
0 commit comments